---
title: "Replication package for: Reviewing Assessment Tools for Measuring Country Statistical Capacity"
author: "Brian Stacy"
date: "3/4/2024"
format: 
  docx:
    reference-doc: custom-reference-doc.docx
    number-sections: false
    keep_md: true
execute:
  echo: false
  warning: false
dpi: 400
knitr:
  opts_chunk:
    fig.path: ".
    
    /figures/"
    dev: 'tiff'
jupyter: python3
---



```{r}
#| label: setup

library(tidyverse)
library(flextable)
library(here)
library(ggthemes)
library(Hmisc)
library(httr)
library(patchwork)
library(ggrepel)
library(haven)
library(zoo)
library(estimatr)
library(ggpmisc)
library(ggthemes)
library(ggtext)
library(modelsummary)
library(readxl)
library(extrafont)
library(fixest) 
library(fmsb)
library(lubridate)
library(forecast)
## Kseniya`s inputs 
library(writexl)


dir <- here()
raw_dir <- paste0(dir, "/01_raw_data/")
output_dir <- paste(dir, '03_output', sep="/")
#weights (either unity (1) or population)
wgt <- 1
end_date=2023

#specification taking either latest year of data available for indices, 2 year, 3 year, or 5 year averages of indices and SDGs for correlations.  options: latest, 2yr, 3yr, 5yr
specification <- "2yr"

```

```{r}
#| label: fun

#ggplot theme
theme_spi <- function () { 
    theme_bw() %+replace%
    theme(
      plot.caption = element_text(hjust = 0),
      plot.title=element_blank() #remove all titles from plots (sometimes we may need to bring title outside plot)
    )
}
# define stle for ggplot based on BBC plotting styles
bbc_style <- function() {
  font <- "Helvetica"
  
  ggplot2::theme(
    
    #Text format:
    #This sets the font, size, type and colour of text for the chart's title
    plot.title = ggplot2::element_text(family=font,
                                       size=32,
                                       face="bold",
                                       color="#222222"),
    #This sets the font, size, type and colour of text for the chart's subtitle, as well as setting a margin between the title and the subtitle
    plot.subtitle = ggplot2::element_text(family=font,
                                          size=20,
                                          margin=ggplot2::margin(9,0,9,0)),
    plot.caption = ggplot2::element_blank(),
    #This leaves the caption text element empty, because it is set elsewhere in the finalise plot function
    
    #Legend format
    #This sets the position and alignment of the legend, removes a title and backround for it and sets the requirements for any text within the legend. The legend may often need some more manual tweaking when it comes to its exact position based on the plot coordinates.
    legend.position = "top",
    legend.text.align = 0,
    legend.background = ggplot2::element_blank(),
    legend.title = ggplot2::element_blank(),
    legend.key = ggplot2::element_blank(),
    legend.text = ggplot2::element_text(family=font,
                                        size=18,
                                        color="#222222"),
    
    #Axis format
    #This sets the text font, size and colour for the axis test, as well as setting the margins and removes lines and ticks. In some cases, axis lines and axis ticks are things we would want to have in the chart - the cookbook shows examples of how to do so.
    axis.title = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(family=font,
                                      size=14,
                                      color="#222222"),
    axis.text.x = ggplot2::element_text(margin=ggplot2::margin(5, b = 10)),
    axis.ticks = ggplot2::element_blank(),
    axis.line = ggplot2::element_blank(),
    
    #Grid lines
    #This removes all minor gridlines and adds major y gridlines. In many cases you will want to change this to remove y gridlines and add x gridlines. The cookbook shows you examples for doing so
    panel.grid.minor = ggplot2::element_blank(),
    panel.grid.major.x = ggplot2::element_line(color="#cbcbcb"),
    panel.grid.major.y = ggplot2::element_blank(),
    
    #Blank background
    #This sets the panel background as blank, removing the standard grey ggplot background colour from the plot
    panel.background = ggplot2::element_blank(),
    
    #Strip background (#This sets the panel background for facet-wrapped plots to white, removing the standard grey ggplot background colour and sets the title size of the facet-wrap title to font size 22)
    strip.background = ggplot2::element_rect(fill="white"),
    strip.text = ggplot2::element_text(size  = 22,  hjust = 0)
  )
}

geom_curve_polar <- function(...) {
  layer <- geom_curve(...)
  new_layer <- ggproto(NULL, layer)
  old_geom <- new_layer$geom
  geom <- ggproto(
    NULL, old_geom,
    draw_panel = function(data, panel_params, coord, 
                          curvature = 0.5, angle = 90, ncp = 5,
                          arrow = NULL, arrow.fill = NULL,
                          lineend = "butt", linejoin = "round",
                          na.rm = FALSE) {
      data <- ggplot2:::remove_missing(
        data, na.rm = na.rm, c("x", "y", "xend", "yend", 
                               "linetype", "size", "shape")
      )
      if (ggplot2:::empty(data)) {
        return(zeroGrob())
      }
      coords <- coord$transform(data, panel_params)
      ends <- transform(data, x = xend, y = yend)
      ends <- coord$transform(ends, panel_params)
      
      arrow.fill <- if (!is.null(arrow.fill)) arrow.fill else coords$colour
      return(grid::curveGrob(
        coords$x, coords$y, ends$x, ends$y,
        default.units = "native", gp = grid::gpar(
          col = alpha(coords$colour, coords$alpha),
          fill = alpha(arrow.fill, coords$alpha),
          lwd = coords$size * .pt,
          lty = coords$linetype,
          lineend = lineend,
          linejoin = linejoin
        ),
        curvature = curvature, angle = angle, ncp = ncp,
        square = FALSE, squareShape = 1, inflect = FALSE, open = TRUE,
        arrow = arrow
      ))
      
    }
  )
  new_layer$geom <- geom
  return(new_layer)
}

gg_bars <- function(z) {
  
  #set color pallete
  col_pal <- c("#ff595e","#ffca3a","#8ac926","#1982c4","#6a4c93")  
  names(col_pal) <- c(str_wrap("Data Use",10), str_wrap("Data Services",10), str_wrap("Data Products",10), str_wrap("Data Sources",10), str_wrap("Data Infrastructure",10))
  
  
  z <- z
  z <- na.omit(z)
  x <- factor(x=c(1,2,3,4,5), labels=c(str_wrap("Data Use",10), str_wrap("Data Services",10), str_wrap("Data Products",10), str_wrap("Data Sources",10), str_wrap("Data Infrastructure",10)))
  z <- data.frame(x = x, z = z, w = z < 0)
  ggplot(z, aes(x = x, y = z)) +
    geom_col(show.legend = FALSE, fill='#7A8776') +
    geom_text(aes(label=scales::percent(z)), nudge_y = .01, size=5) +
    geom_hline(aes(yintercept=0.2)) +
    bbc_style() +
    expand_limits(y=c(0,1)) +
    theme(
      axis.text.x = element_text()
    ) +
    annotate('text', x=1.1, y=0.5, label=str_wrap("Line of Equal Weight",15)) +
    geom_curve(aes(x=0.9,y=0.4, xend=1.2,yend=0.2), curvature=-0.2) 
}

gg_bars2 <- function(z) {
  
  #set color pallete
  col_pal <- c("#ff595e","#ffca3a","#8ac926","#1982c4","#6a4c93")  
  names(col_pal) <- c(str_wrap("Data Use",10), str_wrap("Data Services",10), str_wrap("Data Products",10), str_wrap("Data Sources",10), str_wrap("Data Infrastructure",10))
  
  
  z <- z
  z <- na.omit(z)
  x <- factor(x=c(1,2,3,4,5), labels=c(str_wrap("Data Use",10), str_wrap("Data Services",10), str_wrap("Data Products",10), str_wrap("Data Sources",10), str_wrap("Data Infrastructure",10)))
  z <- data.frame(x = x, z = z, w = z < 0)
  ggplot(z, aes(x = x, y = z)) +
    geom_col(show.legend = FALSE, fill='#7A8776') +
    geom_text(aes(label=scales::percent(z)), nudge_y = .01, size=5) +
    geom_hline(aes(yintercept=0.2)) +
    bbc_style() +
    expand_limits(y=c(0,1)) +
    theme(
      axis.text.x = element_text()
    ) +
    annotate('text', x=1.1, y=0.5, label=str_wrap("Line of Equal Weight",15)) +
    geom_curve(aes(x=0.9,y=0.4, xend=1.2,yend=0.2), curvature=-0.2) 
}

gg_polar <- function(z) {
  
    #set color pallete
  col_pal <- c("#ff595e","#ffca3a","#8ac926","#1982c4","#6a4c93")  
  names(col_pal) <- c(str_wrap("Data Use",10), str_wrap("Data Services",10), str_wrap("Data Products",10), str_wrap("Data Sources",10), str_wrap("Data Infrastructure",10))
  
  z <- z
  z <- na.omit(z)
  x <- factor(x=c(1,2,3,4,5), labels=c(str_wrap("Data Use",10), str_wrap("Data Services",10), str_wrap("Data Products",10), str_wrap("Data Sources",10), str_wrap("Data Infrastructure",10)))
  z <- data.frame(x = x, z = z, w = z < 0)
  ggplot(z, aes(x = x, y = z, fill = x)) +
    geom_col(show.legend = FALSE) +
    geom_text(aes(y=0.35,label=scales::percent(z)), nudge_y = .01) +
    geom_text(aes(y=0.6,label=x), nudge_y = .01, fontface="bold", font="TT Arial") +
    geom_hline(aes(yintercept=0.2)) + 
    #geom_hline(aes(yintercept=0.4)) + 
    #geom_hline(aes(yintercept=0.8)) + 
    geom_vline(aes(xintercept=0.5)) +
    geom_vline(aes(xintercept=1.5)) +
    geom_vline(aes(xintercept=2.5)) +
    geom_vline(aes(xintercept=3.5)) +
    geom_vline(aes(xintercept=4.5)) +
      scale_fill_manual(
        values=col_pal,
        na.value='grey'
      ) +    
    theme_void() +
    expand_limits(y=c(0,1)) +
    theme(
      axis.text.y = element_blank(),
    #Text format:
    #This sets the font, size, type and colour of text for the chart's title
    plot.title = ggplot2::element_text(family="TT Arial",
                                       size=20,
                                       face="bold",
                                       color="#222222"),
    #This sets the font, size, type and colour of text for the chart's subtitle, as well as setting a margin between the title and the subtitle
    plot.subtitle = ggplot2::element_text(family="TT Arial",
                                          size=22,
                                          margin=ggplot2::margin(9,0,9,0)),
    plot.caption = ggplot2::element_blank(),
    #This leaves the caption text element empty, because it is set elsewhere in the finalise plot function
    
    #Legend format
    #This sets the position and alignment of the legend, removes a title and backround for it and sets the requirements for any text within the legend. The legend may often need some more manual tweaking when it comes to its exact position based on the plot coordinates.
    legend.position = "top",
    legend.text.align = 0,
    legend.background = ggplot2::element_blank(),
    legend.title = ggplot2::element_blank(),
    legend.key = ggplot2::element_blank(),
    legend.text = ggplot2::element_text(family="TT Arial",
                                        size=18,
                                        color="#222222"),
    
    #Axis format
    #This sets the text font, size and colour for the axis test, as well as setting the margins and removes lines and ticks. In some cases, axis lines and axis ticks are things we would want to have in the chart - the cookbook shows examples of how to do so.
    axis.title = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(family="TT Arial",
                                      size=8,
                                      color="#222222"),
    axis.text.x = ggplot2::element_blank(),
    axis.ticks = ggplot2::element_blank(),
    axis.line = ggplot2::element_blank(),
    
    #Grid lines
    #This removes all minor gridlines and adds major y gridlines. In many cases you will want to change this to remove y gridlines and add x gridlines. The cookbook shows you examples for doing so
    panel.grid.minor = ggplot2::element_blank(),
    #panel.grid.major.x = ggplot2::element_line(color="#cbcbcb"),
    panel.grid.major.y = ggplot2::element_blank(),
    
    #Blank background
    #This sets the panel background as blank, removing the standard grey ggplot background colour from the plot
    panel.background = ggplot2::element_blank(),
    
    #Strip background (#This sets the panel background for facet-wrapped plots to white, removing the standard grey ggplot background colour and sets the title size of the facet-wrap title to font size 22)
    strip.background = ggplot2::element_rect(fill="white"),
    strip.text = ggplot2::element_text(size  = 22,  hjust = 0)      
    ) + 
    coord_polar() +
    # annotate('text', x=1.5, y=0.6, label=str_wrap("Line of Equal Weight",10)) +
    # geom_curve_polar(aes(x=1.4,y=0.5, xend=1.6,yend=0.2), curvature=-0.2,
    #                  arrow = arrow(length = unit(0.03, "npc")))     +      # Zoom in & cut off values
   ylim(0, 0.65)

}

load(paste0(raw_dir, '/misc/maps.Rdata'))
standard_crop_wintri <- function() {
  l <- list(
    left=-12000000, right=16396891,
    top=9400000, bottom=-6500000
  )
  l$xlim <- c(l$left, l$right)
  l$ylim <- c(l$bottom, l$top)
  l
}
country_metadata <- read_csv(paste0(raw_dir, "country_metadata.csv"))

spi_mapper <- function(data, indicator, title) {
  
 indicator<-indicator
   map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(iso3c, date, data_available, weights) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))     
  spi_groups_quantiles <- quantile(map_df$data_available, probs=c(1,2,3,4)/5,na.rm=T)
  
  SPI_map <- map_df %>%
    mutate(spi_groups=case_when(
      between(data_available, spi_groups_quantiles[4],100) ~ "Top 20%",
      between(data_available, spi_groups_quantiles[3],spi_groups_quantiles[4]) ~ "4th Quintile",
      between(data_available, spi_groups_quantiles[2],spi_groups_quantiles[3]) ~ "3rd Quintile",
      between(data_available, spi_groups_quantiles[1],spi_groups_quantiles[2]) ~ "2nd Quintile",
      between(data_available, 0,spi_groups_quantiles[1]) ~ "Bottom 20%"
      
    )) %>%
    mutate(spi_groups=factor(spi_groups, 
                             levels=c("Top 20%","4th Quintile","3rd Quintile","2nd Quintile","Bottom 20%" )))  
  
  #set color pallete
  col_pal <- c("#2ec4b6","#acece7","#f1dc76","#ffbf69","#ff9f1c")  
  names(col_pal) <- c("Top 20%","4th Quintile","3rd Quintile","2nd Quintile","Bottom 20%" )
  
  p1<-ggplot() +
    geom_map(data = SPI_map, aes(map_id = iso3c, fill = spi_groups), map = maps$countries) + 
    geom_polygon(data = maps$disputed, aes(long, lat, group = group, map_id = id), fill = "grey80") + 
    geom_polygon(data = maps$lakes, aes(long, lat, group = group), fill = "white")  +
    geom_path(data = maps$boundaries,
              aes(long, lat, group = group),
              color = "white",
              size = 0.4,
              lineend = maps$boundaries$lineend,
              linetype = maps$boundaries$linetype) +
    scale_x_continuous(expand = c(0, 0), limits = standard_crop_wintri()$xlim) +
    scale_y_continuous(expand = c(0, 0), limits = standard_crop_wintri()$ylim) +
    scale_fill_manual(
      name='SPI Score',
      values=col_pal,
      na.value='grey'
    ) +
    coord_equal() +
    theme_map(base_size=12) +
    labs(
      title=str_wrap(title,100),
      caption = 'Source: World Bank. Statistical Performance Indicators'
    ) +
    theme(
      text=element_text(size=14)
    )
 print(p1)
}
spi_charts  <- function(data, indicator, title) {
  
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
  #add histogram by region 
  p2 <- map_df %>%
    group_by(region) %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=wtd.mean(data_available, weights = weights, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=region, fill=region)) +
      geom_bar(stat="identity",position='dodge') +
      geom_text(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Region', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
    theme(
      text=element_text(size=14)
    )
  
  #by income
    income <- c("Low income", "Lower middle income","Upper middle income","High income")
    p3 <- map_df %>%
    group_by(income_level) %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=wtd.mean(data_available, weights=weights, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=income, fill=income)) +
      geom_bar(stat="identity",position='dodge') +
      geom_text(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Income', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = income) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
    theme(
      text=element_text(size=14)
    )
    
  # #add line graph over time
  p4 <- get(data)  %>%
    rename(data_available=!! indicator) %>%
    # right_join(spi_df_empty) %>%
    group_by( date) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available))) %>%
    mutate(`SPI Score`=mean(data_available, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    ungroup() %>%
    ggplot(aes(y=`SPI Score`, x=date)) +
      geom_point() +
      geom_line(fill='blue') +
      # geom_text_repel(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Date', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.'
      ) +
    
      expand_limits(y=c(0,100)) +
      theme_spi() +
    theme(
      text=element_text(size=14)
    )
      
 (p2 / p3) 
    
}
spi_country_charts  <- function(data, indicator, title) {
  
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    filter(region!="Aggregates") %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
   spi_groups_quantiles <- quantile(map_df$data_available, probs=c(1,2,3,4)/5,na.rm=T)
  
  SPI_map <- map_df %>%
    mutate(spi_groups=case_when(
      between(data_available, spi_groups_quantiles[4],100) ~ "Top 20%",
      between(data_available, spi_groups_quantiles[3],spi_groups_quantiles[4]) ~ "4th Quintile",
      between(data_available, spi_groups_quantiles[2],spi_groups_quantiles[3]) ~ "3rd Quintile",
      between(data_available, spi_groups_quantiles[1],spi_groups_quantiles[2]) ~ "2nd Quintile",
      between(data_available, 0,spi_groups_quantiles[1]) ~ "Bottom 20%"
      
    )) %>%
    mutate(spi_groups=factor(spi_groups, 
                             levels=c("Top 20%","4th Quintile","3rd Quintile","2nd Quintile","Bottom 20%" )))  
  
  #set color pallete
  col_pal <- c("#2ec4b6","#acece7","#f1dc76","#ffbf69","#ff9f1c")  
  names(col_pal) <- c("Top 20%","4th Quintile","3rd Quintile","2nd Quintile","Bottom 20%" )
  
  p2_alt <- SPI_map %>%
    ungroup() %>%
    ggplot(aes(x=data_available, y=region, color=spi_groups)) +
      geom_point() +
      geom_text(aes(label=country), position=position_jitter(width=.1,height=.4), check_overlap=T) +
      labs(
      title=str_wrap(paste(title, 'By Country', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      xlab('Score') +
      expand_limits(x=c(0,100)) +
      scale_color_manual(
        name='SPI Score',
        values=col_pal,
        na.value='grey'
      ) +
      theme_spi() +
      theme(legend.position = 'top',
            text=element_text(size=14)
            ) 
p2_alt
}


spi_region_charts  <- function(data, indicator, title) {
  
    map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(iso3c, date, data_available, weights) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))  
    
    
    
    
  #add histogram by region 
  region_SPI_df <- map_df %>%
    group_by(region) %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=wtd.mean(data_available, weights = weights, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    arrange(-`SPI Score`) 
  
  region_order <- c("Sub-Saharan Africa" ,
                    "South Asia",
                    "North America",
                    "Middle East & North Africa",
                    "Latin America & Caribbean",
                    "Europe & Central Asia", 
                    "East Asia & Pacific"
                           )

  region_SPI_df <- region_SPI_df %>%
    mutate(region=factor(region, levels=region_order))
  
   p2 <-  ggplot(region_SPI_df, aes(x=`SPI Score`, y=region, fill=region)) +
      geom_bar(stat="identity",position='dodge') +
      geom_text(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Region', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= paste0('Based on data for ',end_date,' or the latest year available')
      ) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
      theme(legend.position = 'top')


  

  print(p2)
  


    
}




income_charts <- function(data, indicator, title) { 
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
  income <- c("Low income", "Lower middle income","Upper middle income","High income")
  p2_alt3 <- map_df %>%
    ungroup() %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=(data_available),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=income_level, color=income_level)) +
      geom_point() +
      geom_text(aes(label=country), position=position_jitter(width=.1,height=.4), check_overlap=T) +
      labs(
      title=str_wrap(paste(title, 'By Income', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = income) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
      theme(legend.position = 'top',
            text=element_text(size=14) )
   
p2_alt3 
  
 
}
spi_income_aggregates <- function(data, indicator, title) {
   indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))   
  
    income <- c("Low income", "Lower middle income","Upper middle income","High income")
    p3 <- map_df %>%
    group_by(income_level) %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=wtd.mean(data_available, weights=weights, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=income_level, fill=income_level)) +
      geom_bar(stat="identity",position='dodge') +
      geom_text(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Income', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = income) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
    theme(
      text=element_text(size=14)
    )
    
    p3
}
lending_charts <- function(data, indicator, title) { 
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
  lending_list <- c("Not classified", "IBRD", "Blend", "IDA" )
  
  p2_alt3 <- map_df %>%
    ungroup() %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=(data_available),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=lending, color=lending)) +
      geom_point() +
      geom_text(aes(label=country), position=position_jitter(width=.1,height=.4), check_overlap=T) +
      labs(
      title=str_wrap(paste(title, 'By Lending Status', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = lending_list) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
      theme(legend.position = 'top',
            text=element_text(size=14) )
   
p2_alt3 
  
 
}
lending_chart_aggregate <- function(data, indicator, title) { 
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
  lending_list <- c("Not classified", "IBRD", "Blend", "IDA" )
  
  
  p2_alt3 <- map_df %>%
    group_by(lending) %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=wtd.mean(data_available, weights = weights, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=lending, fill=lending)) +
      geom_bar(stat="identity",position='dodge') +
      geom_text(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Lending Status', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = lending_list) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
      theme(legend.position = 'top',
            text=element_text(size=14) )
   
p2_alt3 
  
 
}
fcs_charts <- function(data, indicator, title) { 
  #FY21 Fragile and conflict-affected situations (http://pubdocs.worldbank.org/en/888211594267968803/FCSList-FY21.pdf)
  high_intensity_conflict <- c('Afghanistan', 'Libya', 'Somalia', 'Syrian Arab Republic' )
  
  medium_intensity_conflict <- c('Burkina Faso', 'Cameroon','Central African Republic', 'Chad', 'Congo, Dem. Rep.',
                                 'Iraq','Mali','Mozambique','Myanmar','Niger','Nigeria','South Sudan','Yemen, Rep.')
  
  high_institutional_social_fragility <- c('Burundi','Congo, Rep.','Eritrea','Gambia, The','Guinea-Bissau',
                                           'Haiti','Kosovo','Lao PDR','Lebanon','Liberia','Papua New Guinea',
                                           'Sudan','Venezuela, RB','West Bank and Gaza','Zimbabwe')
  
  small_states <- c('Comoros','Kiribati','Marshall Islands','Micronesia, Fed. Sts.','Solomon Islands','Timor-Leste','Tuvalu')
  
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    mutate(fcs=case_when( #create indicators for Fragile and Conflict-affected Situations
      country %in% high_intensity_conflict ~ "FCS country",
      country %in% medium_intensity_conflict ~ "FCS country",
      country %in% high_institutional_social_fragility ~ "FCS country",
      country %in% small_states ~ "FCS country",
      TRUE ~ "Non-FCS country"
    )) %>%
    mutate(fcs_detail=case_when( #create indicators for Fragile and Conflict-affected Situations
      country %in% high_intensity_conflict ~ "High-Intensity Conflict",
      country %in% medium_intensity_conflict ~ "Medium-Intensity Conflict",
      country %in% high_institutional_social_fragility ~ "High Institutional & Social Fragility",
      country %in% small_states ~ "Small States",
      TRUE ~ "Non-FCS country"
    )) %>%
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country,fcs,fcs_detail, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
  fcs_list <- c("High-Intensity Conflict", "Medium-Intensity Conflict","High Institutional & Social Fragility","Small States","Non-FCS country")
  fcs_list <- c("FCS country","Non-FCS country")
  p2_alt2 <- map_df %>%
    ungroup() %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=(data_available),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=fcs, color=fcs)) +
      geom_point() +
      geom_text(aes(label=country), position=position_jitter(width=.1,height=.4), check_overlap=T) +
      labs(
      title=str_wrap(paste(title, 'By Fragile and Conflict-affected Situations (FCS)', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = fcs_list) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
      theme(legend.position = 'top',
            text=element_text(size=14)) 
   
p2_alt2 
  
 
}
fcs_chart_aggregate <- function(data, indicator, title) { 
  #FY21 Fragile and conflict-affected situations (http://pubdocs.worldbank.org/en/888211594267968803/FCSList-FY21.pdf)
  high_intensity_conflict <- c('Afghanistan', 'Libya', 'Somalia', 'Syrian Arab Republic' )
  
  medium_intensity_conflict <- c('Burkina Faso', 'Cameroon','Central African Republic', 'Chad', 'Congo, Dem. Rep.',
                                 'Iraq','Mali','Mozambique','Myanmar','Niger','Nigeria','South Sudan','Yemen, Rep.')
  
  high_institutional_social_fragility <- c('Burundi','Congo, Rep.','Eritrea','Gambia, The','Guinea-Bissau',
                                           'Haiti','Kosovo','Lao PDR','Lebanon','Liberia','Papua New Guinea',
                                           'Sudan','Venezuela, RB','West Bank and Gaza','Zimbabwe')
  
  small_states <- c('Comoros','Kiribati','Marshall Islands','Micronesia, Fed. Sts.','Solomon Islands','Timor-Leste','Tuvalu')
  
 indicator<-indicator
  map_df <- get(data) %>%
    filter(date==max(date, na.rm=T)) %>%
    filter(!(country %in% c('Greenland'))) %>% #drop a few countries for which we do not collect data.
    mutate(fcs=case_when( #create indicators for Fragile and Conflict-affected Situations
      country %in% high_intensity_conflict ~ "FCS country",
      country %in% medium_intensity_conflict ~ "FCS country",
      country %in% high_institutional_social_fragility ~ "FCS country",
      country %in% small_states ~ "FCS country",
      TRUE ~ "Non-FCS country"
    )) %>%
    mutate(fcs_detail=case_when( #create indicators for Fragile and Conflict-affected Situations
      country %in% high_intensity_conflict ~ "High-Intensity Conflict",
      country %in% medium_intensity_conflict ~ "Medium-Intensity Conflict",
      country %in% high_institutional_social_fragility ~ "High Institutional & Social Fragility",
      country %in% small_states ~ "Small States",
      TRUE ~ "Non-FCS country"
    )) %>%
    group_by( country) %>%
    #summarise(across(!! indicator,last)) %>%
    rename(data_available=!! indicator) %>%
    select(country,fcs,fcs_detail, date, data_available, weights ) %>%
    right_join(country_metadata) %>%
    mutate(data_available=if_else(is.na(data_available), as.numeric(NA), as.numeric(data_available)))    
  
  fcs_list <- c("High-Intensity Conflict", "Medium-Intensity Conflict","High Institutional & Social Fragility","Small States","Non-FCS country")
  fcs_list <- c("FCS country","Non-FCS country")
  p2_alt2 <- map_df %>%
    group_by(fcs) %>%
    filter(region!='Aggregates') %>%
    mutate(`SPI Score`=wtd.mean(data_available, weights = weights, na.rm=T),
           Label = paste(round(`SPI Score`,0))) %>%
    ggplot(aes(x=`SPI Score`, y=fcs, fill=fcs)) +
      geom_bar(stat="identity",position='dodge') +
      geom_text(aes(label=Label)) +
      labs(
      title=str_wrap(paste(title, 'By Fragile and Conflict-affected Situations (FCS)', sep=" - "),100),
      caption = 'Source: World Bank. Statistical Performance Indicators.',
      subtitle= 'Based on data for 2023 or the latest year available'
      ) +
      scale_y_discrete(limits = fcs_list) +
      expand_limits(x=c(0,100)) +
      theme_spi() +
      theme(legend.position = 'top',
            text=element_text(size=14)) 
  
  
  
p2_alt2 
  
 
}
FitFlextableToPage <- function(ft, pgwidth = 6){
  ft_out <- ft %>% autofit()
  ft_out <- width(ft_out, width = dim(ft_out)$widths*pgwidth /(flextable_dim(ft_out)$widths))
  return(ft_out)
}
#add equations to plots
eq_plot_txt <- function(data, inp, var) {
  eq <- lm_robust(inp ~ var, data=data, se_type='HC2')
  coef <- round(coef(eq),1)
  std_err <- round(sqrt(diag(vcov(eq))),1)
  r_2<- round(summary(eq)$r.squared,2)
  sprintf(" y = %.1f + %.1f x, R<sup>2</sup> = %.2f <br> (%.1f) <span style='color:white'> %s</span> (%.1f) ", coef[1], coef[2], r_2[1], std_err[1],"s", std_err[2])
  
}

```

```{r}
#| label: dataread

#import data specificially about the SPI
spi_index_df <- read_csv(paste(raw_dir, 'SPI_index.csv', sep="/")) 


spi_df_final <- read_csv( file = paste(raw_dir, 'SPI_data.csv', sep="/")) 

#metadata 
metadata <- data.frame(
  series = c('SPI.INDEX', 'SPI.INDEX.PIL1', 'SPI.INDEX.PIL2', 'SPI.INDEX.PIL3', 'SPI.INDEX.PIL4', 'SPI.INDEX.PIL5'),
  indicator_name = c('SPI Overall Score', 'Pillar 1: Data Use', 'Pillar 2: Data Services', 'Pillar 3: Data Products', 'Pillar 4: Data Sources', 'Pillar 5: Data Infrastructure')
)

SPI_end_date <- spi_index_df %>%
  filter(date==end_date) %>%
  filter(!is.na(SPI.INDEX) & !is.na(weights)) %>%
  mutate(ISO_A3_EH=iso3c) 

SPI <- spi_index_df



## Read in data produced in ./02_programs/misc/spi_lit_review_data_preparation.Rmd
fig1_df <- read_csv(paste0(output_dir, "/Figure_1_data.csv"))

#read in data comparing SPI to other indices and SDGs
if (specification=="latest") {
  comparison_df <- read_csv(paste0(output_dir, "/SPI_Index_SDG_comparisons_data.csv"))

} else if (specification=="2yr"){
  comparison_df <- read_csv(paste0(output_dir, "/SPI_Index_SDG_comparisons_data_2yr_avg.csv"))
} else if (specification=="3yr"){
  comparison_df <- read_csv(paste0(output_dir, "/SPI_Index_SDG_comparisons_data_3yr_avg.csv"))
} else if (specification=="5yr"){
  comparison_df <- read_csv(paste0(output_dir, ".csv"))
}

#read in regression data with covariates
predictors_df <- read_csv(paste0(output_dir, "/SPI_regression_predictors.csv"))

```




## Table 1. Comparing the SPI and Other Data and Statistics Indexes

Table produced using Microsoft Word. Numbers come from research on each index.

## Figure 1. Number of Household Surveys vs. Country Income, 1982- 2023

```{r}
#| label: incomesurveys
#| fig-width: 9
#| fig-height: 6


#read in data from World Bank's PIP
pip_df <- read_csv(paste0(output_dir, "/Figure_1_data.csv")) %>%
  mutate(log_welfare = log(mean_welfare)) # create logged welfare for figure


#plot number of surveys against mean welfare in 2023.
ggplot(pip_df, aes(x=mean_welfare, y=number_surveys)) +
  geom_point(color='blue') +
  geom_smooth(method='lm', se=FALSE, color='darkred') +
  theme_spi() +
  #scale_x_continuous(trans = scales::log2_trans()) +
  ylab('Number of Poverty Surveys') +
  xlab('$ Mean Welfare (logged scale)') +
  geom_richtext(
    aes(x = 2, y = 45,label = eq_plot_txt(pip_df, number_surveys, log_welfare), hjust=0.2)
  ) +
  scale_x_log10(
    label=scales::comma
  ) +
  labs(
    caption = str_wrap('Note: Estimated coefficients are shown from an OLS regression of the number of surveys on log of mean consumption (income); robust standard errors are in parentheses.',100)
  ) +
  #annotate("label", x = 20, y = 5, label = eq_plot_txt(pov_df_2023), hjust = 0) +
  theme(legend.position = 'bottom'
        )

```

**Note**: This figure employs data from the World Bank's PIP database.

## Figure 2. Mapping Other Data Tools to SPI Framework

```{r}
#| label: frameworkdf
table_2_df <- read_excel(path = paste0(raw_dir, "/SPI_Index_compare.xlsx"))

dat <- table_2_df %>%
  select(Index, `Assessment Type`, `Country Coverage`, `Time Coverage`, `Index Methodology`, `Number of Indicators`, ends_with("Percentage")) %>%
  pivot_longer(cols=ends_with("Percentage"),
               names_to = 'Pillar',
               values_to = 'Pillar Coverage') %>%
  mutate(Pillar=str_remove_all(Pillar, " Percentage"),
         Pillar=str_remove_all(Pillar, "Data "),
         short_name=case_when(
           Index=="SPI" ~ "SPI",
           Index=="SCI" ~ "SCI",
           Index=="ODIN" ~ "ODIN",
           Index=="Open Data Barometer" ~ "ODB",
           Index=="Global Data Barometer" ~ "GDB",
           Index=="Ibrahim Index of African Governance Statistical Capacity Measure" ~ "IIAG",
           Index=="EU Snapshot tool"    ~ "Snapshot",
           Index=="UN NQAF self checklist"  ~ "NQAF" ,                     
           Index=="Paris21 NSDS self assessment" ~ "NSDS"
         ),
         Index=case_when(
           Index=="SPI" ~ "SPI (2023)",
           Index=="SCI" ~ "SCI (2020)",
           Index=="ODIN" ~ "ODIN (2022)",
           Index=="Open Data Barometer" ~ "Open Data Barometer (2017)",
           Index=="Global Data Barometer" ~ "Global Data Barometer (2021)",
           Index=="Ibrahim Index of African Governance Statistical Capacity Measure" ~ "Ibrahim Index of African Governance Statistical Capacity Measure (2023)",
           Index=="EU Snapshot tool"    ~ "EU Snapshot tool",
           Index=="UN NQAF self checklist"  ~ "UN NQAF self checklist" ,                     
           Index=="Paris21 NSDS self assessment" ~ "Paris21 NSDS self assessment"          
         )) 
```

```{r echo=FALSE, fig.height=15, fig.width=15, message=FALSE, warning=FALSE}
radarfun <- function(variables) {
    temp <- dat %>%
    filter(Index== variables) %>%
    select(Pillar, `Pillar Coverage`) %>%
    mutate(`Pillar`=str_remove(`Pillar`, " Indicator")) %>%
    pivot_wider(
      names_from=Pillar,
      values_from=`Pillar Coverage`
    ) 
}


  min <- data.frame(  
    Use          = c(0) ,
    Services     = c(0) ,  
    Products     = c(0) ,  
    Sources      = c(0) , 
    Infrastructure = c(0) 
  )

  max <- data.frame(  
    Use          = c(1) ,
    Services     = c(1) ,  
    Products     = c(1) ,  
    Sources      = c(1) , 
    Infrastructure = c(1) 
  )  
  
  ideal <-    data.frame(  
    Use          = c(0.2) ,
    Services     = c(0.2) ,  
    Products     = c(0.2) ,  
    Sources      = c(0.2) , 
    Infrastructure = c(0.2) 
  )  
  


  color <- c("#33a8c7",
             "#52e3e1",
             "#a0e426",
             "#fdf148",
             "#ffab00",
             "#f77976",
             "#f050ae",
             "#bdb2ff",
             "#d883ff",
             "#9336fd")
  
  names(color) <- unique(dat$Index)
  
radarplt <- function(variables, color = "#00AFBB", 
                                        vlabels = colnames(data), vlcex = 0.7,
                                        caxislabels = NULL,  ...) {
  if (variables=="Ibrahim Index of African Governance Statistical Capacity Measure (2023)") {
    title="Ibrahim Index (2023)"
  } else {
    title=variables
  }
  

  max %>%
    bind_rows(min) %>%
    bind_rows(ideal) %>%    
    bind_rows(radarfun(variables)) %>%
    radarchart(axistype = 1,
    # Customize the polygon
    pcol = c('#83c5be',color), pfcol = c(scales::alpha('#83c5be', 0.5),scales::alpha(color, 0.7)), plwd = 2, plty = 1,
    # Customize the grid
    cglcol = "grey", cglty = 1, cglwd = 0.8,cex.main=2,
    # Customize the axis
    axislabcol = "grey", 
    # Variable labels
    vlcex = vlcex, vlabels = vlabels,
    caxislabels = caxislabels, title = title, ...)
}  
  


```

```{r echo=FALSE, fig.height=15, fig.width=15, message=FALSE, warning=FALSE}
#| label: fig2
#| fig-width: 15
#| fig-height: 15
#| 

par(mar = c(1, 1, 1, 1)) #decrease default margin
layout(matrix(1:9, ncol=3)) #draw 4 plots to device
#loop over rows to draw them, add 1 as max and 0 as min for each var
lapply(unique(dat$Index), function(i) { 
    
    radarplt(i, color=color[i],
             vlcex=1.1,
             calcex=1.1)
})
```

```{r}
#| label: volatilitydf1
odin_df <- read_csv(paste0(output_dir, "/ODIN_formatted_data.csv"))

#combine spi with odin
volatility_df <- SPI %>%
  left_join(odin_df) %>%
  select(country, iso3c, date,income, region, SPI.INDEX, ODIN_score) 
           
  
```





## Figure 3. Scatter plot and Trends of ODIN and SPI Overall Scores for 2016 and 2022

```{r}
#| label: volatilityplt12

volatility_scatter_df <- volatility_df %>%
  select(-country, income, region) %>% #avoid conflicts with naming/income groups
  filter(date==2016 | date==2022) %>%
  as_tibble() %>%
  pivot_longer(
    cols=c('SPI.INDEX', 'ODIN_score'),
    names_to='indicator',
    values_to='value'
  ) %>%
  pivot_wider(
    values_from='value',
    names_from='date',
    names_prefix='value_'
  ) %>%
  mutate(indicator=case_when(
    indicator=='ODIN_score' ~ "ODIN Score",
    indicator=="SPI.INDEX" ~ "SPI Overall Score"
  ))

# give a name to a formula
spi_vol_df <- volatility_scatter_df %>%
  filter(indicator=="SPI Overall Score")

odin_vol_df <- volatility_scatter_df %>%
  filter(indicator=="ODIN Score")

#get reg output
data = spi_vol_df
inp = 'value_2022'
var = 'value_2016'

eq <- lm_robust(value_2022 ~ value_2016, data=data, se_type='HC2')
coef <- round(coef(eq),1)
std_err <- round(sqrt(diag(vcov(eq))),1)
r_2<- round(summary(eq)$r.squared,2)
spi_text <- sprintf(" y = %.1f + %.1f x, R<sup>2</sup> = %.2f <br> (%.1f) <span style='color:white'> %s</span> (%.1f) ", coef[1], coef[2], r_2[1], std_err[1],"s", std_err[2])

#odin reg output
data = odin_vol_df
inp = 'value_2022'
var = 'value_2016'

eq <- lm_robust(value_2022 ~ value_2016, data=data, se_type='HC2')
coef <- round(coef(eq),1)
std_err <- round(sqrt(diag(vcov(eq))),1)
r_2<- round(summary(eq)$r.squared,2)
odin_text <- sprintf(" y = %.1f + %.1f x, R<sup>2</sup> = %.2f <br> (%.1f) <span style='color:white'> %s</span> (%.1f) ", coef[1], coef[2], r_2[1], std_err[1],"s", std_err[2])

#plot
p1 <- ggplot(volatility_scatter_df, aes(x=value_2016, y=value_2022,color=indicator, group=indicator)) +
  geom_point() +
  geom_smooth( method='lm', se=FALSE) +
  scale_color_manual(
    values=c(
      'ODIN Score' = "darkred",
      'SPI Overall Score' ="blue" 
    )
  ) +  
  #stat_poly_eq() +
  geom_richtext(
    aes(x = 10, y = 95,label = spi_text, hjust=0.2), colour="blue", size=2
  ) +
  geom_richtext(
    aes(x = 55, y = 15,label = odin_text, hjust=0.2), colour="darkred", size=2
  ) +

  # labs(
  #   title='Volatility Between 2016 and 2020 for SCI and SPI Overall Scores'
  # ) +
  xlab('2016 value') +
  ylab('2022 value') +
  expand_limits(x=c(0,100), y=c(0,100)) +
  theme_spi()
    
```

```{r}
#| label: volatilityplt23
#| fig-height: 8

country_samp <- sample(unique(volatility_df$country),25)

volatility_trace_df <- volatility_df %>%
  as_tibble() %>%
  filter(date>=2016) %>%
  filter(!is.na(SPI.INDEX) & !is.na(ODIN_score)) %>%
  pivot_longer(
    cols=c('SPI.INDEX', 'ODIN_score'),
    names_to='indicator',
    values_to='value'
  )  %>%
  mutate(indicator=case_when(
    indicator=='ODIN_score' ~ "ODIN Score",
    indicator=="SPI.INDEX" ~ "SPI Overall Score"
  ))
  
#randomly seleect a few countries  
set.seed(72456)
country_samp <- sample(unique(volatility_trace_df$country),15)
country_samp <- c("Belgium", "Moldova", "Belarus", "Sao Tome and Principe", "St. Kitts and Nevis", "France",
                  "Cambodia", "Belize", "Ukraine", "Nigeria", "Serbia",
                  "Rwanda", "Lithuania", "Luxembourg", "Romania")
volatility_trace_sample_df <- volatility_trace_df  %>%
  filter(country %in% country_samp)


country_vol_plot <- function(cntry) {
  
  data <- volatility_trace_sample_df %>%
    filter(country==cntry)
  
  ggplot(data, aes(x=date, y=value, color=indicator)) +
    facet_wrap(~country) +
    geom_line( ) +
    geom_point(shape=21, fill="#69b3a2") +
    scale_color_manual(
      values=c(
        'ODIN Score' = "darkred",
        'SPI Overall Score' ="blue" 
      )
    ) +
    guides(color='none') +
    # labs(
    #   title='Volatility for SCI and SPI Overall Scores for 25 Randomly Selected Countries'
    # ) +
    xlab('') +
    ylab('') +
    expand_limits(y=c(0,100)) +
    theme_bw() +
    theme(legend.position = 'none')
}

  
p2 <- country_vol_plot("Belgium"  )             
p3 <- country_vol_plot("Moldova")
p4 <- country_vol_plot("Belarus")
p5 <- country_vol_plot("Sao Tome and Principe")
p6 <- country_vol_plot("St. Kitts and Nevis")
p7 <- country_vol_plot("France")
p8 <- country_vol_plot("Cambodia")
p9 <- country_vol_plot("Belize")
p10 <- country_vol_plot("Ukraine")
p11 <- country_vol_plot("Nigeria")
p12 <- country_vol_plot("Serbia")
p13 <- country_vol_plot("Rwanda" )              
p14 <- country_vol_plot("Lithuania")
p15 <- country_vol_plot("Luxembourg")
p16 <- country_vol_plot("Romania" )



    
```

```{r}
#| label: fig3
#| fig-width: 10
#| fig-height: 13

design <- "
  1234
  5678
  ABCD
  EFGH
"

p1 + p2 + p3 + p4 +  p5 + p6 + p7 + p8 +  p9 + p10 + p11 + p12  +  p13 + p14 + p15 + p16 + 
  plot_layout(nrow=4, guides = 'collect') &
  theme(
    legend.position = 'bottom'
  )

```

```{r}
#get average for romania, serbia, and ukraine as in the paper
vol_num <- volatility_scatter_df <- volatility_df %>%
  filter(iso3c %in% c('ROU', "SRB", "UKR")) %>%
  select(-country, -income, -region) %>% #avoid conflicts with naming/income groups
  as_tibble() %>%
  pivot_longer(
    cols=c('SPI.INDEX', 'ODIN_score'),
    names_to='indicator',
    values_to='value'
  ) %>%
  pivot_wider(
    values_from='value',
    names_from='date',
    names_prefix='value_'
  ) %>%
  mutate(indicator=case_when(
    indicator=='ODIN_score' ~ "ODIN Score",
    indicator=="SPI.INDEX" ~ "SPI Overall Score"
  ))

rou_num1 <- vol_num %>% filter(iso3c=="ROU") %>% filter(indicator=="SPI Overall Score") %>% pluck(7)
rou_num2 <- vol_num %>% filter(iso3c=="ROU") %>% filter(indicator=="SPI Overall Score") %>% pluck(8)
rou_diff <- abs(rou_num2-rou_num1)

srb_num1 <- vol_num %>% filter(iso3c=="SRB") %>% filter(indicator=="SPI Overall Score") %>% pluck(8)
srb_num2 <- vol_num %>% filter(iso3c=="SRB") %>% filter(indicator=="SPI Overall Score") %>% pluck(9)
srb_diff <- abs(srb_num2-srb_num1)

ukr_num1 <- vol_num %>% filter(iso3c=="UKR") %>% filter(indicator=="SPI Overall Score") %>% pluck(5)
ukr_num2 <- vol_num %>% filter(iso3c=="UKR") %>% filter(indicator=="SPI Overall Score") %>% pluck(6)
ukr_diff <- abs(ukr_num2-ukr_num1)

(rou_diff + srb_diff + ukr_diff)/3

```

```{r}
#get the average volatility for SPI overall score and ODIN for all countires

variance_avg_df <- volatility_trace_df %>%   
  group_by(country, indicator) %>%
  summarise(variance=sd(value)) %>%
  pivot_wider(
    names_from='indicator',
    values_from='variance'
  ) %>%
  ungroup() %>%
  summarise(across(c('ODIN Score', 'SPI Overall Score'), mean, na.rm=T)) %>%
  mutate(country="Global Avg")
global_sd_odin <- variance_avg_df$`ODIN Score`[1]
global_sd_spi <- variance_avg_df$`SPI Overall Score`[1]
  
paste0('ODIN std dev: ', global_sd_odin)
paste0('SPI std dev: ', global_sd_spi)
```




## Figure 4. Absolute Value of Correlation between Key SDGs and Indexes

```{r}
#| label: sdgcordf


sdgs <- c('SI.POV.DDAY','SN.ITK.DEFC.ZS','SH.STA.MMRT','SE.LPV.PRIM','SG.LAW.INDX',
          'SH.H2O.SMDW.ZS','EG.ELC.ACCS.ZS','NY.GDP.PCAP.KD','NV.IND.MANF.ZS','SI.POV.GINI',
          'EN.POP.SLUM.UR.ZS','subsidies','EN.GHG.ALL.MT.CE.AR5','ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'
          ,'GE.EST','DT.TDS.DECT.EX.ZS', 'sdg_index_score')

sdg_names <- c('SDG 1: Extreme Poverty','SDG 2: Undernourishment','SDG 3: Maternal Mortality','SDG 4: Learning Poverty','SDG 5: Women, Business, Law Index',
          'SDG 6: Safely Managed Water','SDG 7: Access to Electricity','SDG 8: GDP per capita (2015 constant $)','SDG 9: Manufacturing value added (% of GDP)','SDG 10: Gini Index',
          'SDG 11: Population in Slums','SDG 12: Fossil Fuel Subsidies (% of GDP)','SDG 13: Greenhouse Gas Emissions','SDG 14: Marine protected areas','SDG 15: Terrestrial Protected Areas','SDG 16: Government Effectiveness','SDG 17: Total Debt Service', 'SDR: SDG Index Overall Score')

#get correlations with pvalues
index_cor_full <- comparison_df %>%
  select(c('SPI.INDEX', 'SCI', 'ODIN_score', 'odb', 'gdb'),sdgs) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_full$r
index_cor <- index_cor[6:23,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_full$P
index_sig <- index_sig[6:23,1:5]

rownames(index_cor) <- sdg_names
rownames(index_sig) <- sdg_names 
```

```{r}
#| label: fig4
#| fig-height: 10
#| fig-width: 8


fig <- as_tibble(cbind(index_cor, index_sig,var=sdg_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...6,0.05,0.1) ~   paste0(SPI...1,"*"),
            between(SPI...6,0.01,0.5) ~  paste0(SPI...1,"**"),
            between(SPI...6,0.0,0.01) ~ paste0(SPI...1,"***"),
           is.na(SPI...1) ~ "",
           TRUE ~      paste0(SPI...1,"")
            ),
         SCI=case_when(
           between(SCI...7,0.05,0.1) ~   paste0(SCI...2,"*"),
           between(SCI...7,0.01,0.5) ~  paste0(SCI...2,"**"),
           between(SCI...7,0.0,0.01) ~ paste0(SCI...2,"***"),
           is.na(SCI...2) ~ "",
           TRUE ~      paste0(SCI...2,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...9,0.05,0.1) ~   paste0(ODB...4,"*"),
           between(ODB...9,0.01,0.5) ~  paste0(ODB...4,"**"),
           between(ODB...9,0.0,0.01) ~ paste0(ODB...4,"***"),
           is.na(ODB...4) ~ "",
           TRUE ~      paste0(ODB...4,"")
         ),
         GDB=case_when(
           between(GDB...10,0.05,0.1) ~   paste0(GDB...5,"*"),
           between(GDB...10,0.01,0.5) ~  paste0(GDB...5,"**"),
           between(GDB...10,0.0,0.01) ~ paste0(GDB...5,"***"),
           is.na(GDB...5) ~ "",
           TRUE ~      paste0(GDB...5,"")
         )
    )  %>%
  mutate(var=factor(var, levels=rev(var)))

ggplot(fig, aes(x=var, y=abs(SPI...1))) +
  geom_point(aes(color="SPI", shape="SPI", fill="SPI"), size=4) +
  geom_point(aes(y=abs(SCI...2),color="SCI", shape="SCI", fill="SCI"), size=4) +
  geom_point(aes(y=abs(ODIN...3),color="ODIN", shape="ODIN", fill="ODIN"), size=4) +
  geom_point(aes(y=abs(ODB...4),color="ODB", shape="ODB", fill="ODB"), size=4) +
  geom_point(aes(y=abs(GDB...5),color="GDB", shape="GDB", fill="GDB"), size=4) +
  coord_flip() +
  scale_color_manual(
    values=c(
      "SPI"="#264653",
      "SCI"="#2a9d8f",
      "ODIN" = "#e9c46a",
      "ODB" = "#f4a261",
      "GDB" = "#e76f51"
    )
  ) +
  scale_fill_manual(
    values=c(
      "SPI"="#264653",
      "SCI"="#2a9d8f",
      "ODIN" = "#e9c46a",
      "ODB" = "#f4a261",
      "GDB" = "#e76f51"
    )
  ) +
  scale_shape_manual(
    values=c(
      "SPI"=15,
      "SCI"=6,
      "ODIN" = 17,
      "ODB" = 18,
      "GDB" = 19
    )
  ) +
  xlab("") +
  ylab("") +
  scale_x_discrete(labels=scales::wrap_format(30)) +
  theme_spi() +
  labs(caption=str_wrap("Note: Latest year for the SPI is 2023, for the SCI is 2020, for the ODIN is 2022, for the ODB is 2017, and for the GDB is 2021.",100)) +
  guides(
    fill=guide_legend(' '),
    color=guide_legend(' '),
    shape=guide_legend(' ')
  ) +
  theme(axis.text.y=element_text(size=11),
        legend.text = element_text(size=11),
        legend.position = 'top')



```

```{r}
#turn into a function
sdg_cor_function <- function(comparison_data) {
  #get correlations with pvalues
  index_cor_full <- comparison_data %>%
    select(c('SPI.INDEX', 'SCI', 'ODIN_score', 'odb', 'gdb'),sdgs) %>%
    rename(
      SPI=SPI.INDEX,
      SCI=SCI, 
      ODIN=ODIN_score, 
      ODB=odb, 
      GDB=gdb
    ) %>%
    as.matrix() %>%
    rcorr()
  
  #get just pearon correlation
  index_cor <- index_cor_full$r
  index_cor <- index_cor[6:23,1:5]
  
  
  #round
  index_cor <- round(index_cor,2)
  
  #color
  index_col <- index_cor
  index_col[abs(index_cor)>=0.5] <- "#fcca46"
  index_col[abs(index_cor)>=0.65] <- "#a1c181"
  index_col[abs(index_cor)>=0.8] <- "#619b8a"
  index_col[abs(index_cor)<0.5] <- "white"
  
  #get statistical significance
  index_sig <- index_cor_full$P
  index_sig <- index_sig[6:23,1:5]
  
  rownames(index_cor) <- sdg_names
  rownames(index_sig) <- sdg_names 
  
  fig <- as_tibble(cbind(index_cor, index_sig,var=sdg_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...6,0.05,0.1) ~   paste0(SPI...1,"*"),
            between(SPI...6,0.01,0.5) ~  paste0(SPI...1,"**"),
            between(SPI...6,0.0,0.01) ~ paste0(SPI...1,"***"),
           is.na(SPI...1) ~ "",
           TRUE ~      paste0(SPI...1,"")
            ),
         SCI=case_when(
           between(SCI...7,0.05,0.1) ~   paste0(SCI...2,"*"),
           between(SCI...7,0.01,0.5) ~  paste0(SCI...2,"**"),
           between(SCI...7,0.0,0.01) ~ paste0(SCI...2,"***"),
           is.na(SCI...2) ~ "",
           TRUE ~      paste0(SCI...2,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...9,0.05,0.1) ~   paste0(ODB...4,"*"),
           between(ODB...9,0.01,0.5) ~  paste0(ODB...4,"**"),
           between(ODB...9,0.0,0.01) ~ paste0(ODB...4,"***"),
           is.na(ODB...4) ~ "",
           TRUE ~      paste0(ODB...4,"")
         ),
         GDB=case_when(
           between(GDB...10,0.05,0.1) ~   paste0(GDB...5,"*"),
           between(GDB...10,0.01,0.5) ~  paste0(GDB...5,"**"),
           between(GDB...10,0.0,0.01) ~ paste0(GDB...5,"***"),
           is.na(GDB...5) ~ "",
           TRUE ~      paste0(GDB...5,"")
         )
    )  %>%
  mutate(var=factor(var, levels=rev(var)))

  ggplot(fig, aes(x=var, y=abs(SPI...1))) +
    geom_point(aes(color="SPI", shape="SPI", fill="SPI"), size=4) +
    geom_point(aes(y=abs(SCI...2),color="SCI", shape="SCI", fill="SCI"), size=4) +
    geom_point(aes(y=abs(ODIN...3),color="ODIN", shape="ODIN", fill="ODIN"), size=4) +
    geom_point(aes(y=abs(ODB...4),color="ODB", shape="ODB", fill="ODB"), size=4) +
    geom_point(aes(y=abs(GDB...5),color="GDB", shape="GDB", fill="GDB"), size=4) +
    coord_flip() +
    scale_color_manual(
      values=c(
        "SPI"="#264653",
        "SCI"="#2a9d8f",
        "ODIN" = "#e9c46a",
        "ODB" = "#f4a261",
        "GDB" = "#e76f51"
      )
    ) +
    scale_fill_manual(
      values=c(
        "SPI"="#264653",
        "SCI"="#2a9d8f",
        "ODIN" = "#e9c46a",
        "ODB" = "#f4a261",
        "GDB" = "#e76f51"
      )
    ) +
    scale_shape_manual(
      values=c(
        "SPI"=15,
        "SCI"=6,
        "ODIN" = 17,
        "ODB" = 18,
        "GDB" = 19
      )
    ) +
    xlab("") +
    ylab("") +
    scale_x_discrete(labels=scales::wrap_format(30)) +
    theme_spi() +
    labs(caption=str_wrap("Note: Latest year for the SPI is 2023, for the SCI is 2020, for the ODIN is 2022, for the ODB is 2017, and for the GDB is 2021.",100)) +
    guides(
      fill=guide_legend(' '),
      color=guide_legend(' '),
      shape=guide_legend(' ')
    ) +
    theme(axis.text.y=element_text(size=11),
          legend.text = element_text(size=11),
          legend.position = 'top')
}
```


### Figure 4. Absolute Value of Correlation between Key SDGs and Indexes - Low Income Countries


```{r}
#| label: fig4-lic
#| fig-height: 10
#| fig-width: 8


comparison_lic_df <- comparison_df %>%
  filter(income == "Low income")

#comparison_df  %>% filter(income!='Not classified') %>%ggplot(aes(x=env_perform_index, y=SPI.INDEX)) + facet_wrap(~income) + geom_text(aes(label=iso3c)) + geom_smooth(method='lm', se=FALSE) + theme_bw()

#comparison_df  %>% filter(income!='Not classified') %>%ggplot(aes(x=press_free_score, y=SPI.INDEX)) + facet_wrap(~income) + geom_text(aes(label=iso3c)) + geom_smooth(method='lm', se=FALSE) + theme_bw()

sdg_cor_function(comparison_lic_df)
```

### Figure 4. Absolute Value of Correlation between Key SDGs and Indexes - Lower Middle Income Countries

```{r}
#| label: fig4-lmic
#| fig-height: 10
#| fig-width: 8

comparison_lmic_df <- comparison_df %>%
  filter(income == "Lower middle income")

sdg_cor_function(comparison_lmic_df)
```

### Figure 4. Absolute Value of Correlation between Key SDGs and Indexes - Upper Middle Income Countries

```{r}
#| label: fig4-umic
#| fig-height: 10
#| fig-width: 8

comparison_umic_df <- comparison_df %>%
  filter(income == "Upper middle income")

sdg_cor_function(comparison_umic_df)

```

### Figure 4. Absolute Value of Correlation between Key SDGs and Indexes - High Income Countries


```{r}
#| label: fig4-hic
#| fig-height: 10
#| fig-width: 8

comparison_hic_df <- comparison_df %>%
  filter(income == "High income")

sdg_cor_function(comparison_hic_df)

```

### Figure 4. Absolute Value of Correlation between Key SDGs and Indexes - LIC & LMIC

```{r}
#| label: fig4-lic-lmic
#| fig-height: 10
#| fig-width: 8

comparison_lic_lmic_df <- comparison_df %>%
  filter(income %in% c("Low income", "Lower middle income"))

sdg_cor_function(comparison_lic_lmic_df)

```

### Figure 4. Absolute Value of Correlation between Key SDGs and Indexes - UMIC & HIC

```{r}
#| label: fig4-umic-hic
#| fig-height: 10
#| fig-width: 8

comparison_umic_hic_df <- comparison_df %>%
  filter(income %in% c("Upper middle income", "High income"))

sdg_cor_function(comparison_umic_hic_df)

```


## Figure 5. Absolute Value of Correlations between Key Development Indices

```{r}
#| label: devcordf


devindex <- c( 'eci_value','env_perform_index', 'better_life_index', 'undernourish', 'severe_food_insecurity', 'legatum_health_index', 'ghsindex', 'hdi_value','HD.HCI.OVRL',   'press_free_score')

devindex_names <- c('Economic Complexity Index','Environmental Performance Index','OECD Better Life Index', 'Prevalence of Undernourisment','Prevalence of Severe Food Insecurity', 'Legatum Prosperity Index (LPI) health sub-index score', 'Global Health Security Index (GHSI) overall score', "UN Human Development Index", 'WB Human Capital Index','World Press Freedom Index' )

#get correlations with pvalues
index_cor_full <- comparison_df %>%
  select(c('SPI.INDEX', 'SCI', 'ODIN_score', 'odb', 'gdb'),devindex) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_full$r
index_cor <- index_cor[6:15,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_full$P
index_sig <- index_sig[6:15,1:5]

rownames(index_cor) <- devindex_names
rownames(index_sig) <- devindex_names 
```

```{r}
#| label: fig5
#| fig-height: 7
#| fig-width: 8


fig <- as_tibble(cbind(index_cor, index_sig,var=devindex_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...6,0.05,0.1) ~   paste0(SPI...1,"*"),
            between(SPI...6,0.01,0.5) ~  paste0(SPI...1,"**"),
            between(SPI...6,0.0,0.01) ~ paste0(SPI...1,"***"),
           is.na(SPI...1) ~ "",
           TRUE ~      paste0(SPI...1,"")
            ),
         SCI=case_when(
           between(SCI...7,0.05,0.1) ~   paste0(SCI...2,"*"),
           between(SCI...7,0.01,0.5) ~  paste0(SCI...2,"**"),
           between(SCI...7,0.0,0.01) ~ paste0(SCI...2,"***"),
           is.na(SCI...2) ~ "",
           TRUE ~      paste0(SCI...2,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...9,0.05,0.1) ~   paste0(ODB...4,"*"),
           between(ODB...9,0.01,0.5) ~  paste0(ODB...4,"**"),
           between(ODB...9,0.0,0.01) ~ paste0(ODB...4,"***"),
           is.na(ODB...4) ~ "",
           TRUE ~      paste0(ODB...4,"")
         ),
         GDB=case_when(
           between(GDB...10,0.05,0.1) ~   paste0(GDB...5,"*"),
           between(GDB...10,0.01,0.5) ~  paste0(GDB...5,"**"),
           between(GDB...10,0.0,0.01) ~ paste0(GDB...5,"***"),
           is.na(GDB...5) ~ "",
           TRUE ~      paste0(GDB...5,"")
         )
    )  %>%
  mutate(var=factor(var, levels=rev(var)))

ggplot(fig, aes(x=var, y=abs(SPI...1))) +
  geom_point(aes(color="SPI", shape="SPI", fill="SPI"), size=4) +
  geom_point(aes(y=abs(SCI...2),color="SCI", shape="SCI", fill="SCI"), size=4) +
  geom_point(aes(y=abs(ODIN...3),color="ODIN", shape="ODIN", fill="ODIN"), size=4) +
  geom_point(aes(y=abs(ODB...4),color="ODB", shape="ODB", fill="ODB"), size=4) +
  geom_point(aes(y=abs(GDB...5),color="GDB", shape="GDB", fill="GDB"), size=4) +
  coord_flip() +
  scale_color_manual(
    values=c(
      "SPI"="#264653",
      "SCI"="#2a9d8f",
      "ODIN" = "#e9c46a",
      "ODB" = "#f4a261",
      "GDB" = "#e76f51"
    )
  ) +
  scale_fill_manual(
    values=c(
      "SPI"="#264653",
      "SCI"="#2a9d8f",
      "ODIN" = "#e9c46a",
      "ODB" = "#f4a261",
      "GDB" = "#e76f51"
    )
  ) +
  scale_shape_manual(
    values=c(
      "SPI"=15,
      "SCI"=6,
      "ODIN" = 17,
      "ODB" = 18,
      "GDB" = 19
    )
  ) +
  xlab("") +
  ylab("") +
  scale_x_discrete(labels=scales::wrap_format(30)) +
  theme_spi() +
  labs(caption=str_wrap("Note: Latest year for the SPI is 2023, for the SCI is 2020, for the ODIN is 2022, for the ODB is 2017, and for the GDB is 2021.",100)) +
  guides(
    fill=guide_legend(' '),
    color=guide_legend(' '),
    shape=guide_legend(' ')
  ) +
  theme(axis.text.y=element_text(size=11),
        legend.text = element_text(size=11),
        legend.position = 'top')
```

### Figure 5. Absolute Value of Correlations between Key Development Indices - Low Income Countries

```{r}
plot_correlations <- function(comparison_data) {

  
  # Get correlations with p-values
  index_cor_full <- comparison_data %>%
    select(c('SPI.INDEX', 'SCI', 'ODIN_score', 'odb', 'gdb'), devindex) %>%
    rename(
      SPI = SPI.INDEX,
      SCI = SCI, 
      ODIN = ODIN_score, 
      ODB = odb, 
      GDB = gdb
    ) %>%
    as.matrix() %>%
    rcorr()
  
  # Get just Pearson correlation
  index_cor <- index_cor_full$r
  index_cor <- index_cor[6:15, 1:5]
  
  # Round
  index_cor <- round(index_cor, 2)
  
  # Color
  index_col <- index_cor
  index_col[abs(index_cor) >= 0.5] <- "#fcca46"
  index_col[abs(index_cor) >= 0.65] <- "#a1c181"
  index_col[abs(index_cor) >= 0.8] <- "#619b8a"
  index_col[abs(index_cor) < 0.5] <- "white"
  
  # Get statistical significance
  index_sig <- index_cor_full$P
  index_sig <- index_sig[6:15, 1:5]
  
  rownames(index_cor) <- devindex_names
  rownames(index_sig) <- devindex_names 
  
  # Create the plot
  fig <- as_tibble(cbind(index_cor, index_sig, var = devindex_names), .name_repair = "universal") %>%
    mutate(across(c(1:10), as.numeric)) %>%
    mutate(SPI = case_when(
      between(SPI...6, 0.05, 0.1) ~ paste0(SPI...1, "*"),
      between(SPI...6, 0.01, 0.5) ~ paste0(SPI...1, "**"),
      between(SPI...6, 0.0, 0.01) ~ paste0(SPI...1, "***"),
      is.na(SPI...1) ~ "",
      TRUE ~ paste0(SPI...1, "")
    ),
    SCI = case_when(
      between(SCI...7, 0.05, 0.1) ~ paste0(SCI...2, "*"),
      between(SCI...7, 0.01, 0.5) ~ paste0(SCI...2, "**"),
      between(SCI...7, 0.0, 0.01) ~ paste0(SCI...2, "***"),
      is.na(SCI...2) ~ "",
      TRUE ~ paste0(SCI...2, "")
    ),
    ODIN = case_when(
      between(ODIN...8, 0.05, 0.1) ~ paste0(ODIN...3, "*"),
      between(ODIN...8, 0.01, 0.5) ~ paste0(ODIN...3, "**"),
      between(ODIN...8, 0.0, 0.01) ~ paste0(ODIN...3, "***"),
      is.na(ODIN...3) ~ "",
      TRUE ~ paste0(ODIN...3, "")
    ),
    ODB = case_when(
      between(ODB...9, 0.05, 0.1) ~ paste0(ODB...4, "*"),
      between(ODB...9, 0.01, 0.5) ~ paste0(ODB...4, "**"),
      between(ODB...9, 0.0, 0.01) ~ paste0(ODB...4, "***"),
      is.na(ODB...4) ~ "",
      TRUE ~ paste0(ODB...4, "")
    ),
    GDB = case_when(
      between(GDB...10, 0.05, 0.1) ~ paste0(GDB...5, "*"),
      between(GDB...10, 0.01, 0.5) ~ paste0(GDB...5, "**"),
      between(GDB...10, 0.0, 0.01) ~ paste0(GDB...5, "***"),
      is.na(GDB...5) ~ "",
      TRUE ~ paste0(GDB...5, "")
    )
    ) %>%
    mutate(var = factor(var, levels = rev(var)))
  
  ggplot(fig, aes(x = var, y = abs(SPI...1))) +
    geom_point(aes(color = "SPI", shape = "SPI", fill = "SPI"), size = 4) +
    geom_point(aes(y = abs(SCI...2), color = "SCI", shape = "SCI", fill = "SCI"), size = 4) +
    geom_point(aes(y = abs(ODIN...3), color = "ODIN", shape = "ODIN", fill = "ODIN"), size = 4) +
    geom_point(aes(y = abs(ODB...4), color = "ODB", shape = "ODB", fill = "ODB"), size = 4) +
    geom_point(aes(y = abs(GDB...5), color = "GDB", shape = "GDB", fill = "GDB"), size = 4) +
    coord_flip() +
    scale_color_manual(
      values = c(
        "SPI" = "#264653",
        "SCI" = "#2a9d8f",
        "ODIN" = "#e9c46a",
        "ODB" = "#f4a261",
        "GDB" = "#e76f51"
      )
    ) +
    scale_fill_manual(
      values = c(
        "SPI" = "#264653",
        "SCI" = "#2a9d8f",
        "ODIN" = "#e9c46a",
        "ODB" = "#f4a261",
        "GDB" = "#e76f51"
      )
    ) +
    scale_shape_manual(
      values = c(
        "SPI" = 15,
        "SCI" = 6,
        "ODIN" = 17,
        "ODB" = 18,
        "GDB" = 19
      )
    ) +
    xlab("") +
    ylab("") +
    scale_x_discrete(labels = scales::wrap_format(30)) +
    theme_spi() +
   labs(caption=str_wrap("Note: Latest year for the SPI is 2023, for the SCI is 2020, for the ODIN is 2022, for the ODB is 2017, and for the GDB is 2021.",100)) +
    guides(
      fill = guide_legend(' '),
      color = guide_legend(' '),
      shape = guide_legend(' ')
    ) +
    theme(axis.text.y = element_text(size = 11),
          legend.text = element_text(size = 11),
          legend.position = 'top')
}

# Example usage:
# plot_correlations(comparison_df)
```

```{r}
#| label: fig5-lic
#| fig-height: 6
#| fig-width: 8


comparison_lic_df <- comparison_df %>%
  filter(income == "Low income")

#comparison_df  %>% filter(income!='Not classified') %>%ggplot(aes(x=env_perform_index, y=SPI.INDEX)) + facet_wrap(~income) + geom_text(aes(label=iso3c)) + geom_smooth(method='lm', se=FALSE) + theme_bw()

#comparison_df  %>% filter(income!='Not classified') %>%ggplot(aes(x=press_free_score, y=SPI.INDEX)) + facet_wrap(~income) + geom_text(aes(label=iso3c)) + geom_smooth(method='lm', se=FALSE) + theme_bw()

plot_correlations(comparison_lic_df)
```
### Figure 5. Absolute Value of Correlations between Key Development Indices - Lower Middle Income Countries

```{r}
#| label: fig5-lmic
#| fig-height: 6
#| fig-width: 8

comparison_lmic_df <- comparison_df %>%
  filter(income == "Lower middle income")

plot_correlations(comparison_lmic_df)
```

### Figure 5. Absolute Value of Correlations between Key Development Indices - Upper Middle Income Countries

```{r}
#| label: fig5-umic
#| fig-height: 6
#| fig-width: 8

comparison_umic_df <- comparison_df %>%
  filter(income == "Upper middle income")

plot_correlations(comparison_umic_df)

```

### Figure 5. Absolute Value of Correlations between Key Development Indices - High Income Countries


```{r}
#| label: fig5-hic
#| fig-height: 6
#| fig-width: 8

comparison_hic_df <- comparison_df %>%
  filter(income == "High income")

plot_correlations(comparison_hic_df)

```

### Figure 5. Absolute Value of Correlations between Key Development Indices - LIC & LMIC

```{r}
#| label: fig5-lic-lmic
#| fig-height: 6
#| fig-width: 8

comparison_lic_lmic_df <- comparison_df %>%
  filter(income %in% c("Low income", "Lower middle income"))

plot_correlations(comparison_lic_lmic_df)

```

### Figure 5. Absolute Value of Correlations between Key Development Indices - UMIC & HIC

```{r}
#| label: fig5-umic-hic
#| fig-height: 6
#| fig-width: 8

comparison_umic_hic_df <- comparison_df %>%
  filter(income %in% c("Upper middle income", "High income"))

plot_correlations(comparison_umic_hic_df)

```


## Figure 6. Theory of Change

Figure produced using Microsoft Powerpoint.

![](./other_figures/SPI_theory_change.png)

## Figure 7. The Pillars and Dimensions that Construct the New SPI

Figure produced using Microsoft Powerpoint. Figure reproduced below.

![](./other_figures/SPI_dashboard.png)

## Figure 8. SPI Overall Score and Pillar Scores in 2023

### SPI Overall Score
```{r}
#| label: fig8a
#| fig-height: 6
#| fig-width: 9

spi_mapper('spi_index_df', 'SPI.INDEX', "SPI Overall Score - 2023")
```

### SPI Data Use Score (Pillar 1)

```{r}
#| label: fig8b
#| fig-height: 6
#| fig-width: 9

spi_mapper('spi_index_df', 'SPI.INDEX.PIL1', "SPI Data Use Score - 2023")
```

### SPI Data Services Score (Pillar 2)

```{r}
#| label: fig8c
#| fig-height: 6
#| fig-width: 9

spi_mapper('spi_index_df', 'SPI.INDEX.PIL2', "SPI Data Services Score - 2023")
```

### SPI Data Products Score (Pillar 3)

```{r}
#| label: fig8d
#| fig-height: 6
#| fig-width: 9

spi_mapper('spi_index_df', 'SPI.INDEX.PIL3', "SPI Data Products Score - 2023")

```

### SPI Data Sources Score (Pillar 4)

```{r}
#| label: fig8e
#| fig-height: 6
#| fig-width: 9

spi_mapper('spi_index_df', 'SPI.INDEX.PIL4', "SPI Data Sources Score - 2023")
```

### SPI Data Infrastructure Score (Pillar 5)

```{r}
#| label: fig8f
#| fig-height: 6
#| fig-width: 9

spi_mapper('spi_index_df', 'SPI.INDEX.PIL5', "SPI Data Infrastructure Score - 2023")

```

## Table S1. An Overview of the World Bank's Statistical Capacity Index (SCI) in Selected Recent Studies

Table produced using Microsoft Word. Articles gathered through a literature review.

## Table S2. Description of SPI Dimensions

Table produced using Microsoft Word, based on SPI metadata.

## Table S3. Mapping of SPI Indicators to SDG Indicators

Table produced using Microsoft Word, based on SPI metadata and UN SDG Indicator metadata.

## Table S4. SPI overall score and Pillar Scores in 2023

Below, the full list of countries by their SPI overall score in 2023 is presented. The first column is the country name and the following columns are the overall SPI overall score, and then the sub-scores for pillars 1, 2, 3, 4, and 5. The purpose of the SPI is to help countries assess and improve the performance of their statistical systems. The presentation of SPI overall scores is designed to reflect that aim. Small differences between countries should not be stressed since they can reflect imprecision arising from the currently available indicators rather than meaningful differences in performance. Instead, the presentation of overall SPI scores focuses on larger groupings of countries reflecting broad categories of performance as measured by the indicator framework. In total there are 186 countries with sufficient data to compute an index value. This set of countries covers 99.3 percent of the world population. Countries shaded in dark orange are the lowest performing, countries in dark green are the highest performing. Countries are grouped into five groups:

-   **Top Quintile**: Countries in the Top quintile are classified in this group. Shading in [dark green]{style="color:#2ec4b6"}.\
-   **4th Quintile**: Countries in the 4th quintile, or those above the 60th percentile but below the 80th percentile are in this group. Shading in [light green]{style="color:#acece7"}.\
-   **3rd Quintile**: Countries in the 3rd quintile, or those between the 40th and 60th percentile, are classified in this group. Shading in [yellow]{style="color:#f1dc76"}.\
-   **2nd Quintile**: Countries in the 2nd quintile, or those above the 20th percentile but below the 40th percentile, are in this group. Shading in [light orange]{style="color:#ffbf69"}.\
-   **Bottom 20%**: Countries in the bottom 20% are classified in this group. Shading in [dark orange ]{style="color:#ff9f1c"}.

```{r}
#arrange table and format
index_disp <- spi_index_df %>%
  ungroup() %>%
  filter(date==end_date) %>%
  arrange(-SPI.INDEX) %>%
  mutate(across(starts_with('SPI.INDEX'),~1*.),
         across(starts_with('SPI.INDEX'),round,1)) %>%
  select(country, iso3c, date, starts_with('SPI.INDEX'))

#colors

col_palette <- c("#2ec4b6", "#acece7", "#f1dc76",  "#ffbf69","#ff9f1c"   )

col_palette2 <- c("#2ec4b6",  "#f1dc76", "#ff9f1c" )

#make the table
index_tab <- index_disp %>%
  filter(date==end_date) %>%
  select(country, SPI.INDEX,SPI.INDEX.PIL1,SPI.INDEX.PIL2,SPI.INDEX.PIL3,SPI.INDEX.PIL4,SPI.INDEX.PIL5) 

 #calculate the breaks for the color coding
        brks <- quantile(index_tab$SPI.INDEX, probs=c(1,2,3,4)/5,na.rm=T)
        brks <- append(0,brks)
        brks <- append(brks,100)

        brks1 <- quantile(index_tab$SPI.INDEX.PIL1, probs=c(1,2,3,4)/5,na.rm=T)
        brks1 <- append(0,brks1)
        if (max(brks1)<100) brks1 <- append(brks1,100)
        
        brks2 <- quantile(index_tab$SPI.INDEX.PIL2, probs=c(1,2,3,4)/5,na.rm=T)
        brks2 <- append(0,brks2)
        if (max(brks2)<100) brks2 <- append(brks2,100)
        
        brks3 <- quantile(index_tab$SPI.INDEX.PIL3, probs=c(1,2,3,4)/5,na.rm=T)
        brks3 <- append(0,brks3)
        if (max(brks3)<100) brks3 <- append(brks3,100)
        
        brks4 <- quantile(index_tab$SPI.INDEX.PIL4, probs=c(1,2,3,4)/5,na.rm=T)
        brks4 <- append(0,brks4)
        if (max(brks4)<100) brks4 <- append(brks4,100)
        
        brks5 <- quantile(index_tab$SPI.INDEX.PIL5, probs=c(1,2,3,4)/5,na.rm=T)
        brks5 <- append(0,brks5)
        if (max(brks5)<100) brks5 <- append(brks5,100)
        

      #make nice looking
      index_tab <- index_tab %>%
        flextable() %>%
        # add_header_lines('SPI overall score in end_date and Pillar Scores.') %>%
        set_header_labels(values=list(
                             country="Country",
                             SPI.INDEX="SPI overall score",
                             SPI.INDEX.PIL1="Pillar 1: Data Use",
                             SPI.INDEX.PIL2="Pillar 2: Data Services",
                             SPI.INDEX.PIL3="Pillar 3: Data Products ",
                             SPI.INDEX.PIL4="Pillar 4: Data Sources",
                             SPI.INDEX.PIL5="Pillar 5: Data Infrastructure"
                                         )) %>%
          bg(j = c('SPI.INDEX'),
             bg = scales::col_bin(col_palette, domain=c(0,100), bins=brks, reverse=TRUE)) %>%
          bg(j = c('SPI.INDEX.PIL1'),
             bg = scales::col_bin(col_palette, domain=c(0,100), bins=brks1, reverse=TRUE)) %>%
          bg(j = c('SPI.INDEX.PIL2'),
             bg = scales::col_bin(col_palette, domain=c(0,100), bins=brks2, reverse=TRUE)) %>%
          bg(j = c('SPI.INDEX.PIL3'),
             bg = scales::col_bin(col_palette, domain=c(0,100), bins=brks3, reverse=TRUE)) %>%
          bg(j = c('SPI.INDEX.PIL4'),
             bg = scales::col_bin(col_palette, domain=c(0,100), bins=brks4, reverse=TRUE)) %>%
          bg(j = c('SPI.INDEX.PIL5'),
             bg = scales::col_bin(col_palette, domain=c(0,100), bins=brks5, reverse=TRUE))


FitFlextableToPage(index_tab)

```

## Table S5. Comparison of SPI to Other Statistical and Development Indices

Table produced using Microsoft Word and a review of other indices.


## Table S6. Bivariate Correlation between Statistical Indexes and Key Development Outcomes

```{r}
#get correlations with pvalues
index_cor_full <- comparison_df %>%
  select(c('gdb','odb','ODIN_score', 'SCI', 'SPI.INDEX' ),sdgs) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_full$r
index_cor <- index_cor[6:23,1:5]

#get correlation between indices
index_between_cor <- index_cor_full$r
index_between_cor <- index_between_cor[1:5,1:5]

#get Ns
index_n <- index_cor_full$n
index_n <- index_n[6:23,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_full$P
index_sig <- index_sig[6:23,1:5]

rownames(index_cor) <- sdg_names
rownames(index_sig) <- sdg_names 


tab <- as_tibble(cbind(index_cor, index_sig,var=sdg_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...10,0.05,0.1) ~   paste0(SPI...5,"*"),
            between(SPI...10,0.01,0.5) ~  paste0(SPI...5,"**"),
            between(SPI...10,0.0,0.01) ~ paste0(SPI...5,"***"),
           is.na(SPI...5) ~ "",
           TRUE ~      paste0(SPI...5,"")
            ),
         SCI=case_when(
           between(SCI...9,0.05,0.1) ~   paste0(SCI...4,"*"),
           between(SCI...9,0.01,0.5) ~  paste0(SCI...4,"**"),
           between(SCI...9,0.0,0.01) ~ paste0(SCI...4,"***"),
           is.na(SCI...4) ~ "",
           TRUE ~      paste0(SCI...4,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...7,0.05,0.1) ~   paste0(ODB...2,"*"),
           between(ODB...7,0.01,0.5) ~  paste0(ODB...2,"**"),
           between(ODB...7,0.0,0.01) ~ paste0(ODB...2,"***"),
           is.na(ODB...2) ~ "",
           TRUE ~      paste0(ODB...2,"")
         ),
         GDB=case_when(
           between(GDB...6,0.05,0.1) ~   paste0(GDB...1,"*"),
           between(GDB...6,0.01,0.5) ~  paste0(GDB...1,"**"),
           between(GDB...6,0.0,0.01) ~ paste0(GDB...1,"***"),
           is.na(GDB...1) ~ "",
           TRUE ~      paste0(GDB...1,"")
         )
    ) %>% select( var,GDB, ODB, ODIN, SCI, SPI ) 

# Create a correlation matrix
cor_matrix <- index_cor

# Function to calculate p-value from z-score
p_value_from_z <- function(z) {
  2 * (1 - pnorm(abs(z)))
}

# Iterate through the correlation matrix
n <- nrow(cor_matrix)
c <- ncol(cor_matrix)
p_values <- matrix(NA, n, c*c)  # Store p-values for each pairwise comparison

for (i in 1:n) {
  for (j in 1:c) {
    for (k in 1:c) {
      
      
      row1 <- i
      col1 <- j
      correlation_1 <- cor_matrix[row1,col1]
      
      row2 <- i
      col2 <- k
      correlation_2 <- cor_matrix[row2, col2]
      
      #between correlation
      correlation_between <- index_between_cor[col1,col2]
      
      # Step 3: Calculate standard errors of z-transformed correlations
      n_1 <- index_n[row1,col1]  # Sample size for correlation_1
      n_2 <- index_n[row2,col2]  # Sample size for correlation_2
      n_min <- min(n_1,n_2)

      
      # Step 4: Perform z-test for the difference between z-scores. pearson1898: Pearson and Filon’s [7] z
      kfactor = correlation_between*(1-correlation_1^2-correlation_2^2)-0.5*(correlation_1*correlation_2)*(1-correlation_1^2-correlation_2^2-correlation_between^2)
      z_diff <- sqrt(n_min)*(correlation_1-correlation_2)/sqrt((1-correlation_1^2)^2+(1-correlation_2^2)^2-2*kfactor)
      
      if (is.na(z_diff)) {
        z_diff=0
      }
      # Calculate p-value
      pos <- (j-1)*5+k
      pval <- p_value_from_z(z_diff)
      
      if (k==1 & pval > 0.05)  {
        txt <- "GDB"
      } else if (k==2 & pval > 0.05) {
        txt <- "ODB"
      } else if (k==3  & pval > 0.05) {
        txt <- "ODIN"
      } else if (k==4  & pval > 0.05) {
        txt <- "SCI"
      } else if (k==5  & pval > 0.05) {
        txt <- "SPI"
      } else {
        txt <- ""
      }
      p_values[i, pos] <- txt
    }
  }
}



#format as table
pval_df <- as.data.frame(p_values) %>%
  cbind(SDG=sdg_names) %>%
  select(-V1, -V7, -V13, -V19, -V25) %>%
  select(SDG, everything()) 
```

```{r}

## COmbined table

text_tab <- pval_df %>% 
  mutate(rownum=row_number(), font_small=TRUE) %>%
  mutate(GDB_text=paste(V2, V3, V4, V5, sep=", "),
         ODB_text=paste(V6, V8, V9, V10, sep=", "),
         ODIN_text=paste(V11, V12, V14, V15, sep=", "),
         SCI_text=paste(V16, V17, V18, V20, sep=", "),
         SPI_text=paste(V21, V22, V23, V24, sep=", ")) %>%
  mutate(across(ends_with('_text'), ~gsub(", ,","",.))) %>%
  select(SDG, ends_with('_text'))

comb_tab <- tab %>% 
  rename(SDG=var) %>%
  mutate(rownum=row_number()) %>%
  left_join(text_tab) %>%
  arrange(rownum) %>%
  select(SDG, starts_with("GDB"), starts_with("ODB"),
         starts_with("ODIN"), starts_with("SCI"), starts_with("SPI"))


flextable(comb_tab) %>%
  set_header_labels(
    SDG="SDG",
    GDB="GDB", GDB_text="GDB",
    ODB="ODB", ODB_text="ODB",
    ODIN="ODIN", ODIN_text="ODIN",
    SCI="SCI", SCI_text="SCI",
    SPI="SPI", SPI_text="SPI"
  ) %>%
  
  merge_at(i=1,j=2:3, part = 'header') %>%
  merge_at(i=1,j=4:5, part = 'header') %>%
  merge_at(i=1,j=6:7, part = 'header') %>%
  merge_at(i=1,j=8:9, part = 'header') %>%
  merge_at(i=1,j=10:11, part = 'header') %>% 
  
  fontsize(j = c(3,5,7,9,11), size = 6, part = "body") %>%
  fontsize(j = c(1,2,4,6,8,10), size = 8, part = "body") %>%
  
  vline(j=1) %>%
  vline(j=3) %>%
  vline(j=5) %>%
  vline(j=7) %>%
  vline(j=9) %>%
  align(i=1, part="header", align="center") %>%
  autofit()

## Kseniya`s inputs 
write_xlsx(comb_tab, path = paste0(output_dir, "/results_S6.xlsx"))

```

Note: Latest year for the SPI is 2023, for the SCI is 2020, for the ODIN is 2022, for the ODB is 2017, and for the GDB is 2021.

## Table S6.1 Bivariate Correlation between Statistical Indexes and Key Development Outcomes

```{r}

#get correlations with pvalues
index_cor_lic_lmic <- comparison_df %>%
  filter(income %in% c("Low income", "Lower middle income")) %>% 
  select(c('gdb','odb','ODIN_score', 'SCI', 'SPI.INDEX' ),sdgs) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_lic_lmic$r
index_cor <- index_cor[6:23,1:5]

#get correlation between indices
index_between_cor <- index_cor_lic_lmic$r
index_between_cor <- index_between_cor[1:5,1:5]

#get Ns
index_n <- index_cor_lic_lmic$n
index_n <- index_n[6:23,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_lic_lmic$P
index_sig <- index_sig[6:23,1:5]

rownames(index_cor) <- sdg_names
rownames(index_sig) <- sdg_names 


tab <- as_tibble(cbind(index_cor, index_sig,var=sdg_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...10,0.05,0.1) ~   paste0(SPI...5,"*"),
            between(SPI...10,0.01,0.5) ~  paste0(SPI...5,"**"),
            between(SPI...10,0.0,0.01) ~ paste0(SPI...5,"***"),
           is.na(SPI...5) ~ "",
           TRUE ~      paste0(SPI...5,"")
            ),
         SCI=case_when(
           between(SCI...9,0.05,0.1) ~   paste0(SCI...4,"*"),
           between(SCI...9,0.01,0.5) ~  paste0(SCI...4,"**"),
           between(SCI...9,0.0,0.01) ~ paste0(SCI...4,"***"),
           is.na(SCI...4) ~ "",
           TRUE ~      paste0(SCI...4,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...7,0.05,0.1) ~   paste0(ODB...2,"*"),
           between(ODB...7,0.01,0.5) ~  paste0(ODB...2,"**"),
           between(ODB...7,0.0,0.01) ~ paste0(ODB...2,"***"),
           is.na(ODB...2) ~ "",
           TRUE ~      paste0(ODB...2,"")
         ),
         GDB=case_when(
           between(GDB...6,0.05,0.1) ~   paste0(GDB...1,"*"),
           between(GDB...6,0.01,0.5) ~  paste0(GDB...1,"**"),
           between(GDB...6,0.0,0.01) ~ paste0(GDB...1,"***"),
           is.na(GDB...1) ~ "",
           TRUE ~      paste0(GDB...1,"")
         )
    ) %>% select( var,GDB, ODB, ODIN, SCI, SPI ) 

# Create a correlation matrix
cor_matrix <- index_cor

# Function to calculate p-value from z-score
p_value_from_z <- function(z) {
  2 * (1 - pnorm(abs(z)))
}

# Iterate through the correlation matrix
n <- nrow(cor_matrix)
c <- ncol(cor_matrix)
p_values <- matrix(NA, n, c*c)  # Store p-values for each pairwise comparison

for (i in 1:n) {
  for (j in 1:c) {
    for (k in 1:c) {
      
      
      row1 <- i
      col1 <- j
      correlation_1 <- cor_matrix[row1,col1]
      
      row2 <- i
      col2 <- k
      correlation_2 <- cor_matrix[row2, col2]
      
      #between correlation
      correlation_between <- index_between_cor[col1,col2]
      
      # Step 3: Calculate standard errors of z-transformed correlations
      n_1 <- index_n[row1,col1]  # Sample size for correlation_1
      n_2 <- index_n[row2,col2]  # Sample size for correlation_2
      n_min <- min(n_1,n_2)

      
      # Step 4: Perform z-test for the difference between z-scores. pearson1898: Pearson and Filon’s [7] z
      kfactor = correlation_between*(1-correlation_1^2-correlation_2^2)-0.5*(correlation_1*correlation_2)*(1-correlation_1^2-correlation_2^2-correlation_between^2)
      z_diff <- sqrt(n_min)*(correlation_1-correlation_2)/sqrt((1-correlation_1^2)^2+(1-correlation_2^2)^2-2*kfactor)
      
      if (is.na(z_diff)) {
        z_diff=0
      }
      # Calculate p-value
      pos <- (j-1)*5+k
      pval <- p_value_from_z(z_diff)
      
      if (k==1 & pval > 0.05)  {
        txt <- "GDB"
      } else if (k==2 & pval > 0.05) {
        txt <- "ODB"
      } else if (k==3  & pval > 0.05) {
        txt <- "ODIN"
      } else if (k==4  & pval > 0.05) {
        txt <- "SCI"
      } else if (k==5  & pval > 0.05) {
        txt <- "SPI"
      } else {
        txt <- ""
      }
      p_values[i, pos] <- txt
    }
  }
}



#format as table
pval_df <- as.data.frame(p_values) %>%
  cbind(SDG=sdg_names) %>%
  select(-V1, -V7, -V13, -V19, -V25) %>%
  select(SDG, everything()) 
```


```{r}

## COmbined table

text_tab <- pval_df %>% 
  mutate(rownum=row_number(), font_small=TRUE) %>%
  mutate(GDB_text=paste(V2, V3, V4, V5, sep=", "),
         ODB_text=paste(V6, V8, V9, V10, sep=", "),
         ODIN_text=paste(V11, V12, V14, V15, sep=", "),
         SCI_text=paste(V16, V17, V18, V20, sep=", "),
         SPI_text=paste(V21, V22, V23, V24, sep=", ")) %>%
  mutate(across(ends_with('_text'), ~gsub(", ,","",.))) %>%
  select(SDG, ends_with('_text'))

comb_tab <- tab %>% 
  rename(SDG=var) %>%
  mutate(rownum=row_number()) %>%
  left_join(text_tab) %>%
  arrange(rownum) %>%
  select(SDG, starts_with("GDB"), starts_with("ODB"),
         starts_with("ODIN"), starts_with("SCI"), starts_with("SPI"))


flextable(comb_tab) %>%
  set_header_labels(
    SDG="SDG",
    GDB="GDB", GDB_text="GDB",
    ODB="ODB", ODB_text="ODB",
    ODIN="ODIN", ODIN_text="ODIN",
    SCI="SCI", SCI_text="SCI",
    SPI="SPI", SPI_text="SPI"
  ) %>%
  
  merge_at(i=1,j=2:3, part = 'header') %>%
  merge_at(i=1,j=4:5, part = 'header') %>%
  merge_at(i=1,j=6:7, part = 'header') %>%
  merge_at(i=1,j=8:9, part = 'header') %>%
  merge_at(i=1,j=10:11, part = 'header') %>% 
  
  fontsize(j = c(3,5,7,9,11), size = 6, part = "body") %>%
  fontsize(j = c(1,2,4,6,8,10), size = 8, part = "body") %>%
  
  vline(j=1) %>%
  vline(j=3) %>%
  vline(j=5) %>%
  vline(j=7) %>%
  vline(j=9) %>%
  align(i=1, part="header", align="center") %>%
  autofit()

## Kseniya`s inputs 
write_xlsx(comb_tab, path = paste0(output_dir, "/results_S6_1.xlsx"))

```


## Table S6.2 Bivariate Correlation between Statistical Indexes and Key Development Outcomes

```{r}

#get correlations with pvalues
index_cor_umic_hic <- comparison_df %>%
  filter(income %in% c("Upper middle income", "High income")) %>% 
  select(c('gdb','odb','ODIN_score', 'SCI', 'SPI.INDEX' ),sdgs) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_umic_hic$r
index_cor <- index_cor[6:23,1:5]

#get correlation between indices
index_between_cor <- index_cor_umic_hic$r
index_between_cor <- index_between_cor[1:5,1:5]

#get Ns
index_n <- index_cor_umic_hic$n
index_n <- index_n[6:23,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_umic_hic$P
index_sig <- index_sig[6:23,1:5]

rownames(index_cor) <- sdg_names
rownames(index_sig) <- sdg_names 


tab <- as_tibble(cbind(index_cor, index_sig,var=sdg_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...10,0.05,0.1) ~   paste0(SPI...5,"*"),
            between(SPI...10,0.01,0.5) ~  paste0(SPI...5,"**"),
            between(SPI...10,0.0,0.01) ~ paste0(SPI...5,"***"),
           is.na(SPI...5) ~ "",
           TRUE ~      paste0(SPI...5,"")
            ),
         SCI=case_when(
           between(SCI...9,0.05,0.1) ~   paste0(SCI...4,"*"),
           between(SCI...9,0.01,0.5) ~  paste0(SCI...4,"**"),
           between(SCI...9,0.0,0.01) ~ paste0(SCI...4,"***"),
           is.na(SCI...4) ~ "",
           TRUE ~      paste0(SCI...4,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...7,0.05,0.1) ~   paste0(ODB...2,"*"),
           between(ODB...7,0.01,0.5) ~  paste0(ODB...2,"**"),
           between(ODB...7,0.0,0.01) ~ paste0(ODB...2,"***"),
           is.na(ODB...2) ~ "",
           TRUE ~      paste0(ODB...2,"")
         ),
         GDB=case_when(
           between(GDB...6,0.05,0.1) ~   paste0(GDB...1,"*"),
           between(GDB...6,0.01,0.5) ~  paste0(GDB...1,"**"),
           between(GDB...6,0.0,0.01) ~ paste0(GDB...1,"***"),
           is.na(GDB...1) ~ "",
           TRUE ~      paste0(GDB...1,"")
         )
    ) %>% select( var,GDB, ODB, ODIN, SCI, SPI ) 

# Create a correlation matrix
cor_matrix <- index_cor

# Function to calculate p-value from z-score
p_value_from_z <- function(z) {
  2 * (1 - pnorm(abs(z)))
}

# Iterate through the correlation matrix
n <- nrow(cor_matrix)
c <- ncol(cor_matrix)
p_values <- matrix(NA, n, c*c)  # Store p-values for each pairwise comparison

for (i in 1:n) {
  for (j in 1:c) {
    for (k in 1:c) {
      
      
      row1 <- i
      col1 <- j
      correlation_1 <- cor_matrix[row1,col1]
      
      row2 <- i
      col2 <- k
      correlation_2 <- cor_matrix[row2, col2]
      
      #between correlation
      correlation_between <- index_between_cor[col1,col2]
      
      # Step 3: Calculate standard errors of z-transformed correlations
      n_1 <- index_n[row1,col1]  # Sample size for correlation_1
      n_2 <- index_n[row2,col2]  # Sample size for correlation_2
      n_min <- min(n_1,n_2)

      
      # Step 4: Perform z-test for the difference between z-scores. pearson1898: Pearson and Filon’s [7] z
      kfactor = correlation_between*(1-correlation_1^2-correlation_2^2)-0.5*(correlation_1*correlation_2)*(1-correlation_1^2-correlation_2^2-correlation_between^2)
      z_diff <- sqrt(n_min)*(correlation_1-correlation_2)/sqrt((1-correlation_1^2)^2+(1-correlation_2^2)^2-2*kfactor)
      
      if (is.na(z_diff)) {
        z_diff=0
      }
      # Calculate p-value
      pos <- (j-1)*5+k
      pval <- p_value_from_z(z_diff)
      
      if (k==1 & pval > 0.05)  {
        txt <- "GDB"
      } else if (k==2 & pval > 0.05) {
        txt <- "ODB"
      } else if (k==3  & pval > 0.05) {
        txt <- "ODIN"
      } else if (k==4  & pval > 0.05) {
        txt <- "SCI"
      } else if (k==5  & pval > 0.05) {
        txt <- "SPI"
      } else {
        txt <- ""
      }
      p_values[i, pos] <- txt
    }
  }
}



#format as table
pval_df <- as.data.frame(p_values) %>%
  cbind(SDG=sdg_names) %>%
  select(-V1, -V7, -V13, -V19, -V25) %>%
  select(SDG, everything()) 
```

```{r}

## COmbined table

text_tab <- pval_df %>% 
  mutate(rownum=row_number(), font_small=TRUE) %>%
  mutate(GDB_text=paste(V2, V3, V4, V5, sep=", "),
         ODB_text=paste(V6, V8, V9, V10, sep=", "),
         ODIN_text=paste(V11, V12, V14, V15, sep=", "),
         SCI_text=paste(V16, V17, V18, V20, sep=", "),
         SPI_text=paste(V21, V22, V23, V24, sep=", ")) %>%
  mutate(across(ends_with('_text'), ~gsub(", ,","",.))) %>%
  select(SDG, ends_with('_text'))

comb_tab <- tab %>% 
  rename(SDG=var) %>%
  mutate(rownum=row_number()) %>%
  left_join(text_tab) %>%
  arrange(rownum) %>%
  select(SDG, starts_with("GDB"), starts_with("ODB"),
         starts_with("ODIN"), starts_with("SCI"), starts_with("SPI"))


flextable(comb_tab) %>%
  set_header_labels(
    SDG="SDG",
    GDB="GDB", GDB_text="GDB",
    ODB="ODB", ODB_text="ODB",
    ODIN="ODIN", ODIN_text="ODIN",
    SCI="SCI", SCI_text="SCI",
    SPI="SPI", SPI_text="SPI"
  ) %>%
  
  merge_at(i=1,j=2:3, part = 'header') %>%
  merge_at(i=1,j=4:5, part = 'header') %>%
  merge_at(i=1,j=6:7, part = 'header') %>%
  merge_at(i=1,j=8:9, part = 'header') %>%
  merge_at(i=1,j=10:11, part = 'header') %>% 
  
  fontsize(j = c(3,5,7,9,11), size = 6, part = "body") %>%
  fontsize(j = c(1,2,4,6,8,10), size = 8, part = "body") %>%
  
  vline(j=1) %>%
  vline(j=3) %>%
  vline(j=5) %>%
  vline(j=7) %>%
  vline(j=9) %>%
  align(i=1, part="header", align="center") %>%
  autofit()

## Kseniya`s inputs 
write_xlsx(comb_tab, path = paste0(output_dir, "/results_S6_2.xlsx"))

```

## Table S7. Bivariate Correlation between Statistical Indexes and Key Development Indices


```{r}
#get correlations with pvalues
index_cor_full <- comparison_df %>%
  select(c('gdb','odb','ODIN_score', 'SCI', 'SPI.INDEX' ),devindex) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_full$r
index_cor <- index_cor[6:15,1:5]

#get correlation between indices
index_between_cor <- index_cor_full$r
index_between_cor <- index_between_cor[1:5,1:5]

#get Ns
index_n <- index_cor_full$n
index_n <- index_n[6:15,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_full$P
index_sig <- index_sig[6:15,1:5]

rownames(index_cor) <- devindex_names
rownames(index_sig) <- devindex_names 


tab <- as_tibble(cbind(index_cor, index_sig,var=devindex_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...10,0.05,0.1) ~   paste0(SPI...5,"*"),
            between(SPI...10,0.01,0.5) ~  paste0(SPI...5,"**"),
            between(SPI...10,0.0,0.01) ~ paste0(SPI...5,"***"),
           is.na(SPI...5) ~ "",
           TRUE ~      paste0(SPI...5,"")
            ),
         SCI=case_when(
           between(SCI...9,0.05,0.1) ~   paste0(SCI...4,"*"),
           between(SCI...9,0.01,0.5) ~  paste0(SCI...4,"**"),
           between(SCI...9,0.0,0.01) ~ paste0(SCI...4,"***"),
           is.na(SCI...4) ~ "",
           TRUE ~      paste0(SCI...4,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...7,0.05,0.1) ~   paste0(ODB...2,"*"),
           between(ODB...7,0.01,0.5) ~  paste0(ODB...2,"**"),
           between(ODB...7,0.0,0.01) ~ paste0(ODB...2,"***"),
           is.na(ODB...2) ~ "",
           TRUE ~      paste0(ODB...2,"")
         ),
         GDB=case_when(
           between(GDB...6,0.05,0.1) ~   paste0(GDB...1,"*"),
           between(GDB...6,0.01,0.5) ~  paste0(GDB...1,"**"),
           between(GDB...6,0.0,0.01) ~ paste0(GDB...1,"***"),
           is.na(GDB...1) ~ "",
           TRUE ~      paste0(GDB...1,"")
         )
    ) %>% select( var,GDB, ODB, ODIN, SCI, SPI ) 

# Create a correlation matrix
cor_matrix <- index_cor

# Function to calculate p-value from z-score
p_value_from_z <- function(z) {
  2 * (1 - pnorm(abs(z)))
}

# Iterate through the correlation matrix
n <- nrow(cor_matrix)
c <- ncol(cor_matrix)
p_values <- matrix(NA, n, c*c)  # Store p-values for each pairwise comparison

for (i in 1:n) {
  for (j in 1:c) {
    for (k in 1:c) {
      
      
      row1 <- i
      col1 <- j
      correlation_1 <- cor_matrix[row1,col1]
      
      row2 <- i
      col2 <- k
      correlation_2 <- cor_matrix[row2, col2]
      
      #between correlation
      correlation_between <- index_between_cor[col1,col2]
      
      # Step 3: Calculate standard errors of z-transformed correlations
      n_1 <- index_n[row1,col1]  # Sample size for correlation_1
      n_2 <- index_n[row2,col2]  # Sample size for correlation_2
      n_min <- min(n_1,n_2)

      
      # Step 4: Perform z-test for the difference between z-scores. pearson1898: Pearson and Filon’s [7] z
      kfactor = correlation_between*(1-correlation_1^2-correlation_2^2)-0.5*(correlation_1*correlation_2)*(1-correlation_1^2-correlation_2^2-correlation_between^2)
      z_diff <- sqrt(n_min)*(correlation_1-correlation_2)/sqrt((1-correlation_1^2)^2+(1-correlation_2^2)^2-2*kfactor)
      
      if (is.na(z_diff)) {
        z_diff=0
      }
      # Calculate p-value
      pos <- (j-1)*5+k
      pval <- p_value_from_z(z_diff)
      
      if (k==1 & pval > 0.05)  {
        txt <- "GDB"
      } else if (k==2 & pval > 0.05) {
        txt <- "ODB"
      } else if (k==3  & pval > 0.05) {
        txt <- "ODIN"
      } else if (k==4  & pval > 0.05) {
        txt <- "SCI"
      } else if (k==5  & pval > 0.05) {
        txt <- "SPI"
      } else {
        txt <- ""
      }
      p_values[i, pos] <- txt
    }
  }
}

#format as table
pval_df <- as.data.frame(p_values) %>%
  cbind(`Dev Index`=devindex_names) %>%
  select(-V1, -V7, -V13, -V19, -V25) %>%
  select(`Dev Index`, everything()) 

```



```{r}

## COmbined table

text_tab <- pval_df %>% 
  mutate(rownum=row_number(), font_small=TRUE) %>%
  mutate(GDB_text=paste(V2, V3, V4, V5, sep=", "),
         ODB_text=paste(V6, V8, V9, V10, sep=", "),
         ODIN_text=paste(V11, V12, V14, V15, sep=", "),
         SCI_text=paste(V16, V17, V18, V20, sep=", "),
         SPI_text=paste(V21, V22, V23, V24, sep=", ")) %>%
  mutate(across(ends_with('_text'), ~gsub(", ,","",.))) %>%
  select(`Dev Index`, ends_with('_text'))

comb_tab <- tab %>% 
  rename(`Dev Index`=var) %>%
  mutate(rownum=row_number()) %>%
  left_join(text_tab) %>%
  arrange(rownum) %>%
  select(`Dev Index`, starts_with("GDB"), starts_with("ODB"),
         starts_with("ODIN"), starts_with("SCI"), starts_with("SPI"))


flextable(comb_tab) %>%
  set_header_labels(
    `Dev Index`="Index",
    GDB="GDB", GDB_text="GDB",
    ODB="ODB", ODB_text="ODB",
    ODIN="ODIN", ODIN_text="ODIN",
    SCI="SCI", SCI_text="SCI",
    SPI="SPI", SPI_text="SPI"
  ) %>%
  
  merge_at(i=1,j=2:3, part = 'header') %>%
  merge_at(i=1,j=4:5, part = 'header') %>%
  merge_at(i=1,j=6:7, part = 'header') %>%
  merge_at(i=1,j=8:9, part = 'header') %>%
  merge_at(i=1,j=10:11, part = 'header') %>% 
  
  fontsize(j = c(3,5,7,9,11), size = 6, part = "body") %>%
  fontsize(j = c(1,2,4,6,8,10), size = 8, part = "body") %>%
  
  vline(j=1) %>%
  vline(j=3) %>%
  vline(j=5) %>%
  vline(j=7) %>%
  vline(j=9) %>%
  align(i=1, part="header", align="center") %>%
  autofit()

## Kseniya`s inputs 
write_xlsx(comb_tab, path = paste0(output_dir, "/results_S7.xlsx"))

```

Note: Latest year for the SPI is 2023, for the SCI is 2020, for the ODIN is 2022, for the ODB is 2017, and for the GDB is 2021.

```{r}
#| eval: false
#| include: false


#test differences between models
#step 1 combine data
combined_test_data <- bind_rows(comparison_df %>% mutate(model='SPI', index=SPI.INDEX) ,
                           comparison_df %>% mutate(model='SCI', index=SCI),
                           comparison_df %>% mutate(model='ODIN', index=ODIN_score),
                           comparison_df %>% mutate(model='ODB', index=odb),
                           comparison_df %>% mutate(model='GDB', index=gdb)
                           ) %>%
  mutate(model=factor(model, levels=c('SPI', "SCI",'ODIN','ODB','GDB')))


#model1
test_models <- list(
"Economic Complexity Index" = feols(eci_value ~  index*model + model, data=combined_test_data, se='cluster', cluster='country'),

"Environmental Performance Index" = feols(env_perform_index ~  index*model + model, data=combined_test_data, se='cluster', cluster='country'),

"OECD Better Life Index" = feols(better_life_index ~  index*model + model, data=combined_test_data, se='cluster', cluster='country'),

"UN Human Development Index" = feols(hdi_value ~  index*model + model, data=combined_test_data, se='cluster', cluster='country'),

"WB Human Capital Index" = feols(HD.HCI.OVRL ~  index*model + model, data=combined_test_data, se='cluster', cluster='country'),

"World Press Freedom Index" = feols(press_free_score ~  index*model + model, data=combined_test_data, se='cluster', cluster='country')


)

modelsummary(test_models,
             estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c('index'='SPI',
                           'index:modelSCI'='SPI-SCI',
                           'index:modelODIN' = 'SPI-ODIN',
                           'index:modelODB' = 'SPI-ODB',
                           'index:modelGDB' = 'SPI-GDB'),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )

```


```{r}
#| label: regdataprep

#this code will apply to the next 6 tables, as they rely on a constant set of predictors

#read in predictor variables created in spi_lit_review_data_preparation.Rmd
predictors_df <- read_csv( paste0(output_dir, "/SPI_regression_predictors.csv"))

#read in data for other indices


# predictors_df <- predictors_df %>%
#   left_join(imf_dta)

SPI_2016 <- spi_index_df %>%
  filter(date>=2016) %>%
  group_by(iso3c) %>%
  fill(starts_with( 'SPI.INDEX'), .direction="downup") %>%
  ungroup() %>%
  filter(!is.na(SPI.INDEX))

countries_2016 <- unique(SPI_2016$country)

#add in some metadata about the country
reg_df <- SPI_2016 %>%
  select(country, iso3c, date, region, SPI.INDEX, SPI.INDEX.PIL1, SPI.INDEX.PIL2, SPI.INDEX.PIL3, SPI.INDEX.PIL4, SPI.INDEX.PIL5) %>%
  left_join(predictors_df) %>%
  filter(between(date,2016,2023)) %>%
  filter(!(is.na(SPI.INDEX) | 
             is.na(NY.GDP.PCAP.KD) |
             is.na(NV.IND.MANF.ZS) |
             is.na(NV.AGR.TOTL.ZS) |
             is.na(SE.PRM.ENRR) |
             is.na(NE.TRD.GNFS.ZS) |
             is.na(CC.EST) |
             is.na(GE.EST) |
             is.na(PV.EST) |
             is.na(RQ.EST) |
             is.na(RL.EST) |
             is.na(VA.EST) )) %>%
  mutate(region_year_fe=paste(region,date,sep=" - "))

ncountry <- length(unique(reg_df$country))

#ODIN regressions
#add in some metadata about the country
odin_reg_df <- odin_df %>%
  select(iso3c, date, ODIN_score) %>%
  left_join(country_metadata) %>%
  left_join(predictors_df) %>%
  filter(between(date,2016,2022)) %>%
  filter(!(is.na(ODIN_score) | 
             is.na(NY.GDP.PCAP.KD) |
             is.na(NV.IND.MANF.ZS) |
             is.na(NV.AGR.TOTL.ZS) |
             is.na(SE.PRM.ENRR) |
             is.na(NE.TRD.GNFS.ZS) |
             is.na(CC.EST) |
             is.na(GE.EST) |
             is.na(PV.EST) |
             is.na(RQ.EST) |
             is.na(RL.EST) |
             is.na(VA.EST) )) %>%
  mutate(region_year_fe=paste(region,date,sep=" - "))

#ODB Regression
odb_reg_df <- read_csv(paste0(output_dir, "/ODB_formatted_data.csv")) %>%
  select(iso3c, date, odb) %>%
  left_join(country_metadata) %>%
  left_join(predictors_df) %>%
  filter(between(date,2013,2022)) %>%
  filter(!(is.na(odb) | 
             is.na(NY.GDP.PCAP.KD) |
             is.na(NV.IND.MANF.ZS) |
             is.na(NV.AGR.TOTL.ZS) |
             is.na(SE.PRM.ENRR) |
             is.na(NE.TRD.GNFS.ZS) |
             is.na(CC.EST) |
             is.na(GE.EST) |
             is.na(PV.EST) |
             is.na(RQ.EST) |
             is.na(RL.EST) |
             is.na(VA.EST) )) %>%
  mutate(region_year_fe=paste(region,date,sep=" - "))

#ODB Regression
gdb_reg_df <- read_csv(paste0(output_dir, "/GDB_formatted_data.csv")) %>%
  select(iso3c, date, gdb) %>%
  left_join(country_metadata) %>%
  left_join(predictors_df) %>%
  filter(between(date,2016,2022)) %>%
  filter(!(is.na(gdb) | 
             is.na(NY.GDP.PCAP.KD) |
             is.na(NV.IND.MANF.ZS) |
             is.na(NV.AGR.TOTL.ZS) |
             is.na(SE.PRM.ENRR) |
             is.na(NE.TRD.GNFS.ZS) |
             is.na(CC.EST) |
             is.na(GE.EST) |
             is.na(PV.EST) |
             is.na(RQ.EST) |
             is.na(RL.EST) |
             is.na(VA.EST) )) %>%
  mutate(region_year_fe=paste(region,date,sep=" - "))

#create one file and merge them together
reg_data <- reg_df %>%
  left_join(odin_reg_df %>% select(iso3c, date, ODIN_score), by=c('iso3c','date')) %>%
  left_join(odb_reg_df %>% select(iso3c, date, odb), by=c('iso3c','date')) %>%
  left_join(gdb_reg_df %>% select(iso3c, date, gdb), by=c('iso3c','date')) %>%
  select(-region_year_fe) %>%
  #convert names to stata format
  janitor::clean_names() 

#save as .dta
write_dta(reg_data, paste0(output_dir, "/spi_lit_review_reg_data.dta"))


```

```{r}


#use modelsumary to predict
#modelsummary output
gm <- tibble::tribble(
  ~raw,        ~clean,          ~fmt,
  "nobs",      "N",             0,
  'r.squared', "R<sup>2</sup>", 3)



```

## Table S7.1 Bivariate Correlation between Statistical Indexes and Key Development Indices

```{r}
#get correlations with pvalues
index_cor_lic_lmic <- comparison_df %>%
  filter(income %in% c("Low income", "Lower middle income")) %>%   
  select(c('gdb','odb','ODIN_score', 'SCI', 'SPI.INDEX' ),devindex) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_lic_lmic$r
index_cor <- index_cor[6:15,1:5]

#get correlation between indices
index_between_cor <- index_cor_lic_lmic$r
index_between_cor <- index_between_cor[1:5,1:5]

#get Ns
index_n <- index_cor_lic_lmic$n
index_n <- index_n[6:15,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_lic_lmic$P
index_sig <- index_sig[6:15,1:5]

rownames(index_cor) <- devindex_names
rownames(index_sig) <- devindex_names 


tab <- as_tibble(cbind(index_cor, index_sig,var=devindex_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...10,0.05,0.1) ~   paste0(SPI...5,"*"),
            between(SPI...10,0.01,0.5) ~  paste0(SPI...5,"**"),
            between(SPI...10,0.0,0.01) ~ paste0(SPI...5,"***"),
           is.na(SPI...5) ~ "",
           TRUE ~      paste0(SPI...5,"")
            ),
         SCI=case_when(
           between(SCI...9,0.05,0.1) ~   paste0(SCI...4,"*"),
           between(SCI...9,0.01,0.5) ~  paste0(SCI...4,"**"),
           between(SCI...9,0.0,0.01) ~ paste0(SCI...4,"***"),
           is.na(SCI...4) ~ "",
           TRUE ~      paste0(SCI...4,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...7,0.05,0.1) ~   paste0(ODB...2,"*"),
           between(ODB...7,0.01,0.5) ~  paste0(ODB...2,"**"),
           between(ODB...7,0.0,0.01) ~ paste0(ODB...2,"***"),
           is.na(ODB...2) ~ "",
           TRUE ~      paste0(ODB...2,"")
         ),
         GDB=case_when(
           between(GDB...6,0.05,0.1) ~   paste0(GDB...1,"*"),
           between(GDB...6,0.01,0.5) ~  paste0(GDB...1,"**"),
           between(GDB...6,0.0,0.01) ~ paste0(GDB...1,"***"),
           is.na(GDB...1) ~ "",
           TRUE ~      paste0(GDB...1,"")
         )
    ) %>% select( var,GDB, ODB, ODIN, SCI, SPI ) 

# Create a correlation matrix
cor_matrix <- index_cor

# Function to calculate p-value from z-score
p_value_from_z <- function(z) {
  2 * (1 - pnorm(abs(z)))
}

# Iterate through the correlation matrix
n <- nrow(cor_matrix)
c <- ncol(cor_matrix)
p_values <- matrix(NA, n, c*c)  # Store p-values for each pairwise comparison

for (i in 1:n) {
  for (j in 1:c) {
    for (k in 1:c) {
      
      
      row1 <- i
      col1 <- j
      correlation_1 <- cor_matrix[row1,col1]
      
      row2 <- i
      col2 <- k
      correlation_2 <- cor_matrix[row2, col2]
      
      #between correlation
      correlation_between <- index_between_cor[col1,col2]
      
      # Step 3: Calculate standard errors of z-transformed correlations
      n_1 <- index_n[row1,col1]  # Sample size for correlation_1
      n_2 <- index_n[row2,col2]  # Sample size for correlation_2
      n_min <- min(n_1,n_2)

      
      # Step 4: Perform z-test for the difference between z-scores. pearson1898: Pearson and Filon’s [7] z
      kfactor = correlation_between*(1-correlation_1^2-correlation_2^2)-0.5*(correlation_1*correlation_2)*(1-correlation_1^2-correlation_2^2-correlation_between^2)
      z_diff <- sqrt(n_min)*(correlation_1-correlation_2)/sqrt((1-correlation_1^2)^2+(1-correlation_2^2)^2-2*kfactor)
      
      if (is.na(z_diff)) {
        z_diff=0
      }
      # Calculate p-value
      pos <- (j-1)*5+k
      pval <- p_value_from_z(z_diff)
      
      if (k==1 & pval > 0.05)  {
        txt <- "GDB"
      } else if (k==2 & pval > 0.05) {
        txt <- "ODB"
      } else if (k==3  & pval > 0.05) {
        txt <- "ODIN"
      } else if (k==4  & pval > 0.05) {
        txt <- "SCI"
      } else if (k==5  & pval > 0.05) {
        txt <- "SPI"
      } else {
        txt <- ""
      }
      p_values[i, pos] <- txt
    }
  }
}

#format as table
pval_df <- as.data.frame(p_values) %>%
  cbind(`Dev Index`=devindex_names) %>%
  select(-V1, -V7, -V13, -V19, -V25) %>%
  select(`Dev Index`, everything()) 

```


```{r}

## COmbined table

text_tab <- pval_df %>% 
  mutate(rownum=row_number(), font_small=TRUE) %>%
  mutate(GDB_text=paste(V2, V3, V4, V5, sep=", "),
         ODB_text=paste(V6, V8, V9, V10, sep=", "),
         ODIN_text=paste(V11, V12, V14, V15, sep=", "),
         SCI_text=paste(V16, V17, V18, V20, sep=", "),
         SPI_text=paste(V21, V22, V23, V24, sep=", ")) %>%
  mutate(across(ends_with('_text'), ~gsub(", ,","",.))) %>%
  select(`Dev Index`, ends_with('_text'))

comb_tab <- tab %>% 
  rename(`Dev Index`=var) %>%
  mutate(rownum=row_number()) %>%
  left_join(text_tab) %>%
  arrange(rownum) %>%
  select(`Dev Index`, starts_with("GDB"), starts_with("ODB"),
         starts_with("ODIN"), starts_with("SCI"), starts_with("SPI"))


flextable(comb_tab) %>%
  set_header_labels(
    `Dev Index`="Index",
    GDB="GDB", GDB_text="GDB",
    ODB="ODB", ODB_text="ODB",
    ODIN="ODIN", ODIN_text="ODIN",
    SCI="SCI", SCI_text="SCI",
    SPI="SPI", SPI_text="SPI"
  ) %>%
  
  merge_at(i=1,j=2:3, part = 'header') %>%
  merge_at(i=1,j=4:5, part = 'header') %>%
  merge_at(i=1,j=6:7, part = 'header') %>%
  merge_at(i=1,j=8:9, part = 'header') %>%
  merge_at(i=1,j=10:11, part = 'header') %>% 
  
  fontsize(j = c(3,5,7,9,11), size = 6, part = "body") %>%
  fontsize(j = c(1,2,4,6,8,10), size = 8, part = "body") %>%
  
  vline(j=1) %>%
  vline(j=3) %>%
  vline(j=5) %>%
  vline(j=7) %>%
  vline(j=9) %>%
  align(i=1, part="header", align="center") %>%
  autofit()

## Kseniya`s inputs 
write_xlsx(comb_tab, path = paste0(output_dir, "/results_S7_1.xlsx"))

```


## Table S7.2 Bivariate Correlation between Statistical Indexes and Key Development Indices

```{r}
#get correlations with pvalues
index_cor_umic_hic <- comparison_df %>%
  filter(income %in% c("Upper middle income", "High income")) %>%    
  select(c('gdb','odb','ODIN_score', 'SCI', 'SPI.INDEX' ),devindex) %>%
  rename(
    SPI=SPI.INDEX,
    SCI=SCI, 
    ODIN=ODIN_score, 
    ODB=odb, 
    GDB=gdb
  ) %>%
  as.matrix() %>%
  rcorr()

#get just pearon correlation
index_cor <- index_cor_umic_hic$r
index_cor <- index_cor[6:15,1:5]

#get correlation between indices
index_between_cor <- index_cor_umic_hic$r
index_between_cor <- index_between_cor[1:5,1:5]

#get Ns
index_n <- index_cor_umic_hic$n
index_n <- index_n[6:15,1:5]


#round
index_cor <- round(index_cor,2)

#color
index_col <- index_cor
index_col[abs(index_cor)>=0.5] <- "#fcca46"
index_col[abs(index_cor)>=0.65] <- "#a1c181"
index_col[abs(index_cor)>=0.8] <- "#619b8a"
index_col[abs(index_cor)<0.5] <- "white"

#get statistical significance
index_sig <- index_cor_umic_hic$P
index_sig <- index_sig[6:15,1:5]

rownames(index_cor) <- devindex_names
rownames(index_sig) <- devindex_names 


tab <- as_tibble(cbind(index_cor, index_sig,var=devindex_names), .name_repair="universal") %>%
  mutate(across(c(1:10), as.numeric)) %>%
  mutate(SPI=case_when(
            between(SPI...10,0.05,0.1) ~   paste0(SPI...5,"*"),
            between(SPI...10,0.01,0.5) ~  paste0(SPI...5,"**"),
            between(SPI...10,0.0,0.01) ~ paste0(SPI...5,"***"),
           is.na(SPI...5) ~ "",
           TRUE ~      paste0(SPI...5,"")
            ),
         SCI=case_when(
           between(SCI...9,0.05,0.1) ~   paste0(SCI...4,"*"),
           between(SCI...9,0.01,0.5) ~  paste0(SCI...4,"**"),
           between(SCI...9,0.0,0.01) ~ paste0(SCI...4,"***"),
           is.na(SCI...4) ~ "",
           TRUE ~      paste0(SCI...4,"")
           ),
         ODIN=case_when(
           between(ODIN...8,0.05,0.1) ~   paste0(ODIN...3,"*"),
           between(ODIN...8,0.01,0.5) ~  paste0(ODIN...3,"**"),
           between(ODIN...8,0.0,0.01) ~ paste0(ODIN...3,"***"),
           is.na(ODIN...3) ~ "",
           TRUE ~      paste0(ODIN...3,"")
         ),
         ODB=case_when(
           between(ODB...7,0.05,0.1) ~   paste0(ODB...2,"*"),
           between(ODB...7,0.01,0.5) ~  paste0(ODB...2,"**"),
           between(ODB...7,0.0,0.01) ~ paste0(ODB...2,"***"),
           is.na(ODB...2) ~ "",
           TRUE ~      paste0(ODB...2,"")
         ),
         GDB=case_when(
           between(GDB...6,0.05,0.1) ~   paste0(GDB...1,"*"),
           between(GDB...6,0.01,0.5) ~  paste0(GDB...1,"**"),
           between(GDB...6,0.0,0.01) ~ paste0(GDB...1,"***"),
           is.na(GDB...1) ~ "",
           TRUE ~      paste0(GDB...1,"")
         )
    ) %>% select( var,GDB, ODB, ODIN, SCI, SPI ) 

# Create a correlation matrix
cor_matrix <- index_cor

# Function to calculate p-value from z-score
p_value_from_z <- function(z) {
  2 * (1 - pnorm(abs(z)))
}

# Iterate through the correlation matrix
n <- nrow(cor_matrix)
c <- ncol(cor_matrix)
p_values <- matrix(NA, n, c*c)  # Store p-values for each pairwise comparison

for (i in 1:n) {
  for (j in 1:c) {
    for (k in 1:c) {
      
      
      row1 <- i
      col1 <- j
      correlation_1 <- cor_matrix[row1,col1]
      
      row2 <- i
      col2 <- k
      correlation_2 <- cor_matrix[row2, col2]
      
      #between correlation
      correlation_between <- index_between_cor[col1,col2]
      
      # Step 3: Calculate standard errors of z-transformed correlations
      n_1 <- index_n[row1,col1]  # Sample size for correlation_1
      n_2 <- index_n[row2,col2]  # Sample size for correlation_2
      n_min <- min(n_1,n_2)

      
      # Step 4: Perform z-test for the difference between z-scores. pearson1898: Pearson and Filon’s [7] z
      kfactor = correlation_between*(1-correlation_1^2-correlation_2^2)-0.5*(correlation_1*correlation_2)*(1-correlation_1^2-correlation_2^2-correlation_between^2)
      z_diff <- sqrt(n_min)*(correlation_1-correlation_2)/sqrt((1-correlation_1^2)^2+(1-correlation_2^2)^2-2*kfactor)
      
      if (is.na(z_diff)) {
        z_diff=0
      }
      # Calculate p-value
      pos <- (j-1)*5+k
      pval <- p_value_from_z(z_diff)
      
      if (k==1 & pval > 0.05)  {
        txt <- "GDB"
      } else if (k==2 & pval > 0.05) {
        txt <- "ODB"
      } else if (k==3  & pval > 0.05) {
        txt <- "ODIN"
      } else if (k==4  & pval > 0.05) {
        txt <- "SCI"
      } else if (k==5  & pval > 0.05) {
        txt <- "SPI"
      } else {
        txt <- ""
      }
      p_values[i, pos] <- txt
    }
  }
}

#format as table
pval_df <- as.data.frame(p_values) %>%
  cbind(`Dev Index`=devindex_names) %>%
  select(-V1, -V7, -V13, -V19, -V25) %>%
  select(`Dev Index`, everything()) 

```

```{r}

## COmbined table

text_tab <- pval_df %>% 
  mutate(rownum=row_number(), font_small=TRUE) %>%
  mutate(GDB_text=paste(V2, V3, V4, V5, sep=", "),
         ODB_text=paste(V6, V8, V9, V10, sep=", "),
         ODIN_text=paste(V11, V12, V14, V15, sep=", "),
         SCI_text=paste(V16, V17, V18, V20, sep=", "),
         SPI_text=paste(V21, V22, V23, V24, sep=", ")) %>%
  mutate(across(ends_with('_text'), ~gsub(", ,","",.))) %>%
  select(`Dev Index`, ends_with('_text'))

comb_tab <- tab %>% 
  rename(`Dev Index`=var) %>%
  mutate(rownum=row_number()) %>%
  left_join(text_tab) %>%
  arrange(rownum) %>%
  select(`Dev Index`, starts_with("GDB"), starts_with("ODB"),
         starts_with("ODIN"), starts_with("SCI"), starts_with("SPI"))


flextable(comb_tab) %>%
  set_header_labels(
    `Dev Index`="Index",
    GDB="GDB", GDB_text="GDB",
    ODB="ODB", ODB_text="ODB",
    ODIN="ODIN", ODIN_text="ODIN",
    SCI="SCI", SCI_text="SCI",
    SPI="SPI", SPI_text="SPI"
  ) %>%
  
  merge_at(i=1,j=2:3, part = 'header') %>%
  merge_at(i=1,j=4:5, part = 'header') %>%
  merge_at(i=1,j=6:7, part = 'header') %>%
  merge_at(i=1,j=8:9, part = 'header') %>%
  merge_at(i=1,j=10:11, part = 'header') %>% 
  
  fontsize(j = c(3,5,7,9,11), size = 6, part = "body") %>%
  fontsize(j = c(1,2,4,6,8,10), size = 8, part = "body") %>%
  
  vline(j=1) %>%
  vline(j=3) %>%
  vline(j=5) %>%
  vline(j=7) %>%
  vline(j=9) %>%
  align(i=1, part="header", align="center") %>%
  autofit()

## Kseniya`s inputs 
write_xlsx(comb_tab, path = paste0(output_dir, "/results_S7_2.xlsx"))

```

## Table S8. Relationship between the SDG Index Overall Score from the 2023 Sustainable Development Report and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(sdg_index_score ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(sdg_index_score ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(sdg_index_score ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(sdg_index_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(sdg_index_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(sdg_index_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(sdg_index_score ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(sdg_index_score~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(sdg_index_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(sdg_index_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```


## Table S9 Relationship between the SDG Index Overall Score from the 2023 Sustainable Development Report and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(sdg_index_score ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(sdg_index_score ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(sdg_index_score ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(sdg_index_score ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(sdg_index_score ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(sdg_index_score ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(sdg_index_score ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(sdg_index_score ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(sdg_index_score ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(sdg_index_score ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(sdg_index_score ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```


## Table S10. Relationship between the Economic Complexity Index and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(eci_value ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(eci_value ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(eci_value ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(eci_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(eci_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(eci_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(eci_value ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(eci_value~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(eci_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(eci_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```


## Table S11. Relationship between the Environmental Performance Index and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(env_perform_index ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(env_perform_index ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(env_perform_index ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(env_perform_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(env_perform_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(env_perform_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(env_perform_index ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)

p <- plm::plm(env_perform_index~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(env_perform_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(env_perform_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```

## Table S12. Relationship between the OECD Better Life Index and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(better_life_index ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(better_life_index ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(better_life_index ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(better_life_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(better_life_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(better_life_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(better_life_index ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(better_life_index~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(better_life_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(better_life_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```

## Table S13. Relationship between the UN Human Development Index and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(hdi_value ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(hdi_value ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(hdi_value ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(hdi_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(hdi_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(hdi_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(hdi_value ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(hdi_value~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(hdi_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(hdi_value ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```

## Kseniya input Table S13.1 Relationship between the Prevalence of Undernourishment and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(undernourish ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(undernourish ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(undernourish ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(undernourish ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(undernourish ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(undernourish ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(undernourish ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(undernourish~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(undernourish ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(undernourish ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```

## Kseniya input Tables S13.2 Relationship between the Prevalence of Prevalence of Severe Food Insecurity and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(severe_food_insecurity ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(severe_food_insecurity ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(severe_food_insecurity ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(severe_food_insecurity ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(severe_food_insecurity ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(severe_food_insecurity ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}

p <- plm::plm(severe_food_insecurity ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(severe_food_insecurity~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(severe_food_insecurity ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(severe_food_insecurity ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

```


## Kseniya input Tables S13.3 Relationship between the Prevalence of Prevalence of Legatum Prosperity Index and SPI scores, 2016-2023

```{r}
 
#create regression models
 SDG_models <- list(

 'Model 1' = feols(legatum_health_index ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(legatum_health_index ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(legatum_health_index ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(legatum_health_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(legatum_health_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(legatum_health_index ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
 

```

```{r}

library(plm)

# Declare panel data
pdata <- pdata.frame(reg_df, index = c('country', 'date'))

# Fit model to get summary stats
mod <- plm(legatum_health_index ~ 1, data = pdata, model = "pooling")
summary(mod)

library(dplyr)

# Overall stats
overall_mean <- mean(pdata$legatum_health_index, na.rm = TRUE)
overall_sd   <- sd(pdata$legatum_health_index, na.rm = TRUE)

# Between variation (mean by group, then SD)
between_sd <- pdata %>%
  group_by('country') %>%
  summarise(mean_index = mean(legatum_health_index, na.rm = TRUE)) %>%
  summarise(sd = sd(mean_index, na.rm = TRUE)) %>%
  pull(sd)

# Within variation (remove group mean from each observation)
within_sd <- pdata %>%
  group_by(country) %>%
  mutate(centered = legatum_health_index - mean(legatum_health_index, na.rm = TRUE)) %>%
  summarise(sd = sd(centered, na.rm = TRUE)) %>%
  summarise(mean_sd = mean(sd, na.rm = TRUE)) %>%
  pull(mean_sd)

# Print results like xtsum
cat("xtsum-like output for index:\n")
cat("Overall mean:", round(overall_mean, 2), "\n")
cat("Overall SD:", round(overall_sd, 2), "\n")
cat("Between SD:", round(between_sd, 2), "\n")
cat("Within SD:", round(within_sd, 2), "\n")




```

## Kseniya input Tables S13.4 Relationship between the Global Health Security Index and SPI scores, 2016-2023

```{r}
 
#create regression models
 SDG_models <- list(

 'Model 1' = feols(ghsindex ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(ghsindex ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(ghsindex ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(ghsindex ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(ghsindex ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(ghsindex ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(ghsindex ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(ghsindex~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(ghsindex ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(ghsindex ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```


## Table S14. Relationship between the WB Human Capital Index and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(HD.HCI.OVRL ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(HD.HCI.OVRL ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(HD.HCI.OVRL ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(HD.HCI.OVRL ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(HD.HCI.OVRL ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(HD.HCI.OVRL ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(HD.HCI.OVRL ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)

p <- plm::plm(HD.HCI.OVRL~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(HD.HCI.OVRL ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(HD.HCI.OVRL ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```

## Table S15. Relationship between the World Press Freedom Index and SPI scores, 2016-2023

```{r}

#create regression models
 SDG_models <- list(

 'Model 1' = feols(press_free_score ~  SPI.INDEX , data=reg_df, se='cluster', cluster='country') ,


 'Model 2' = feols(press_free_score ~  SPI.INDEX  + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 3' = feols(press_free_score ~  SPI.INDEX  + log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country'),
 
 'Model 4' = feols(press_free_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 , data=reg_df, se='cluster', cluster='country') ,


 'Model 5' = feols(press_free_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5    + factor(date) | country, data=reg_df, se='cluster', cluster='country') ,

 'Model 6' = feols(press_free_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5  +   log(NY.GDP.PCAP.KD) + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR    + factor(date)| country , data=reg_df, se='cluster', cluster='country')
 

 )
 modelsummary(SDG_models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                              "log(NY.GDP.PCAP.KD)" = "Log GDP per capita (constant 2015 US$)",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes="Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data are from the World Bank's World Development Indicators (WDI) and SPI. In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(press_free_score ~  SPI.INDEX, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2))) 
summary(p)

p <- plm::plm(press_free_score~  SPI.INDEX + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(press_free_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5, index=c('country', 'date'), data=reg_df, model='random') 
print(paste0("Model 5: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(press_free_score ~  SPI.INDEX.PIL1 + SPI.INDEX.PIL2 + SPI.INDEX.PIL3 + SPI.INDEX.PIL4 + SPI.INDEX.PIL5 + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=reg_df, model='random')
print(paste0("Model 6: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))


```


## Table S16. Relationship between the Economic Complexity Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(eci_value ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(eci_value ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(eci_value ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(eci_value ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(eci_value ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(eci_value ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(eci_value ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(eci_value ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(eci_value ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(eci_value ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(eci_value ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```


## Table S17 Relationship between the Environmental Performance Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(env_perform_index ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(env_perform_index ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(env_perform_index ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(env_perform_index ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(env_perform_index ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(env_perform_index ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(env_perform_index ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(env_perform_index ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(env_perform_index ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(env_perform_index ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 


print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(env_perform_index ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

```

## Table S18. Relationship between the OECD Better Life Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(better_life_index ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(better_life_index ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(better_life_index ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(better_life_index ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(better_life_index ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(better_life_index ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(better_life_index ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(better_life_index ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(better_life_index ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(better_life_index ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(better_life_index ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```


## Table S19. Relationship between the UN Human Development Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(hdi_value ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(hdi_value ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(hdi_value ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(hdi_value ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(hdi_value ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(hdi_value ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(hdi_value ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(hdi_value ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(hdi_value ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(hdi_value ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(hdi_value ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```

## Table S19.1 Relationship between the Prevalence of Undernourishment and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(undernourish ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(undernourish ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(undernourish ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(undernourish ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(undernourish ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(undernourish ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(undernourish ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(undernourish ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(undernourish ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(undernourish ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(undernourish ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```


## Table S19.2 Relationship between the Severe Food Insecurity and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(severe_food_insecurity ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(severe_food_insecurity ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(severe_food_insecurity ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(severe_food_insecurity ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(severe_food_insecurity ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(severe_food_insecurity ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(severe_food_insecurity ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(severe_food_insecurity ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(severe_food_insecurity ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(severe_food_insecurity ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(severe_food_insecurity ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```


## Table S19.3 Relationship between the Legatum Prosperity Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(legatum_health_index ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(legatum_health_index ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(legatum_health_index ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(legatum_health_index ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(legatum_health_index ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(legatum_health_index ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(legatum_health_index ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
#p <- plm::plm(legatum_health_index ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
#print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
#summary(p)

p <- plm::plm(legatum_health_index ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

#p <- plm::plm(legatum_health_index ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
#print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
#summary(p)

p <- plm::plm(legatum_health_index ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```

## Table S19.4 Relationship between the Global Health Security Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(ghsindex ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(ghsindex ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(ghsindex ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(ghsindex ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(ghsindex ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(ghsindex ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(ghsindex ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(ghsindex ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)

p <- plm::plm(ghsindex ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

#p <- plm::plm(ghsindex ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
#print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
#summary(p)

p <- plm::plm(ghsindex ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```

## Table S20. Relationship between the WB Human Capital Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(HD.HCI.OVRL ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(HD.HCI.OVRL ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(HD.HCI.OVRL ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(HD.HCI.OVRL ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(HD.HCI.OVRL ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(HD.HCI.OVRL ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(HD.HCI.OVRL ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(HD.HCI.OVRL ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(HD.HCI.OVRL ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(HD.HCI.OVRL ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(HD.HCI.OVRL ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```

## Table S21. Relationship between the World Press Freedom Index and ODIN, Open Data Barometer, and Global Data Barometer scores, 2013-2022

```{r}
#create regression models
 models <- list(

 'ODIN - Model 1' = feols(press_free_score ~  ODIN_score , data=odin_reg_df, se='cluster', cluster='country') ,


 'ODIN - Model 2' = feols(press_free_score ~  ODIN_score  + factor(date) | country, data=odin_reg_df, se='cluster', cluster='country') ,

 'ODIN - Model 3' = feols(press_free_score ~  ODIN_score  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odin_reg_df, se='cluster', cluster='country'),

  'ODB - Model 1' = feols(press_free_score ~  odb , data=odb_reg_df, se='cluster', cluster='country') ,


 'ODB - Model 2' = feols(press_free_score ~  odb  + factor(date) | country, data=odb_reg_df, se='cluster', cluster='country') ,

 'ODB - Model 3' = feols(press_free_score ~  odb  + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD) + factor(date)| country , data=odb_reg_df, se='cluster', cluster='country'),
 
  'GDB - Model 1' = feols(press_free_score ~  gdb , data=gdb_reg_df, se='cluster', cluster='country') 
)

 modelsummary(models,
              estimate= "{estimate}{stars}",
              #vcov=list(NULL,"HC1",NULL),
              coef_map = c("SPI.INDEX"  = "Overall SPI Score",
                              "SPI.INDEX.PIL1"  = "SPI Pillar 1 Score (Data use)",
                              "SPI.INDEX.PIL2"  = "SPI Pillar 2 Score (Data services)",
                              "SPI.INDEX.PIL3"  = "SPI Pillar 3 Score (Data products)",
                              "SPI.INDEX.PIL4"  = "SPI Pillar 4 Score (Data sources)",
                              "SPI.INDEX.PIL5"  = "SPI Pillar 5 Score (Data infrastructure)",
                           "ODIN_score" = "ODIN Score",
                           "odb" = "Open Data Barometer Score",
                           "gdb" = "Global Data Barometer Score",
                              "NE.TRD.GNFS.ZS" = "Trade (% of GDP)",
                              "NV.AGR.TOTL.ZS"= "Agriculture, forestry, fishing value added (% of GDP)",
                              "NV.IND.MANF.ZS"  = "Manufacturing value added (% of GDP)",
                              "SE.PRM.ENRR" = "School Enrollment, Primary (% gross)" ,
                              "WGI.OVL" = "WGI Index",
                              "factor(date)2013"="Year 2013",
                              "factor(date)2014"="Year 2014",
                              "factor(date)2015"="Year 2015",
                              "factor(date)2016"="Year 2016",
                              "factor(date)2017"="Year 2017",
                              "factor(date)2018"="Year 2018",
                              "factor(date)2019"="Year 2019",
                              "factor(date)2020"="Year 2020",
                              "factor(date)2021"="Year 2021",
                              "factor(date)2022"="Year 2022",
                              "factor(date)2023"="Year 2023",
                              "(Intercept)" = "Constant",
                              "CC.EST" = "WGI: Control of Corruption: Estimate",
                              "GE.EST" = "WGI: Governance Effectiveness: Estimate",
                              "PV.EST" = "WGI: Political Stability and Absence of Violence/Terrorism: Estimate",
                              "RQ.EST" = "WGI: Regulatory Quality: Estimate",
                              "RL.EST" = "WGI: Rule of Law: Estimate",
                              "VA.EST" = "WGI: Voice and Accountability: Estimate"),
              notes=" Note: * p<0.10, ** p<0.05, *** p<0.01. Standard errors are clustered at the country level. Data from the World Bank's World Development Indicators (WDI), Open Data Watch (ODIN), Global Data Barometer (GDB), and Open Data Barometer (ODB). In cases where data are missing for a particular covariate, the data are imputed forward using the nearest available value. Estimates with country fixed effects not available for the Global Data Barometer, because the indicator contains on ly one time period.",
              stars = c('*' = .1, '**' = .05, '***' = .01),
              fmt = fmt_statistic("estimate" = 3, "std.error" = 2),
              escape = FALSE
              )
```

```{r}
p <- plm::plm(press_free_score ~  ODIN_score, index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
summary(p)
p <- plm::plm(press_free_score ~  ODIN_score + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odin_reg_df, model='random') 
print(paste0("ODIN Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))

p <- plm::plm(press_free_score ~  odb, index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 2: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
p <- plm::plm(press_free_score ~  odb + NE.TRD.GNFS.ZS + NV.IND.MANF.ZS + NV.AGR.TOTL.ZS +  SE.PRM.ENRR   + log(NY.GDP.PCAP.KD), index=c('country', 'date'), data=odb_reg_df, model='random') 
print(paste0("ODB Model 3: sigma_e=",round(sqrt(p$ercomp$sigma2[1]),2), ". sigma_u=",round(sqrt(p$ercomp$sigma2[2]),2)))
```
